err.no Git - linux-2.6/blob - fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include <linux/capability.h>
  20
  21 #include "xfs.h"
  22 #include "xfs_fs.h"
  23 #include "xfs_types.h"
  24 #include "xfs_bit.h"
  25 #include "xfs_log.h"
  26 #include "xfs_inum.h"
  27 #include "xfs_trans.h"
  28 #include "xfs_sb.h"
  29 #include "xfs_ag.h"
  30 #include "xfs_dir.h"
  31 #include "xfs_dir2.h"
  32 #include "xfs_dmapi.h"
  33 #include "xfs_mount.h"
  34 #include "xfs_da_btree.h"
  35 #include "xfs_bmap_btree.h"
  36 #include "xfs_alloc_btree.h"
  37 #include "xfs_ialloc_btree.h"
  38 #include "xfs_dir_sf.h"
  39 #include "xfs_dir2_sf.h"
  40 #include "xfs_attr_sf.h"
  41 #include "xfs_dinode.h"
  42 #include "xfs_inode.h"
  43 #include "xfs_inode_item.h"
  44 #include "xfs_dir_leaf.h"
  45 #include "xfs_itable.h"
  46 #include "xfs_btree.h"
  47 #include "xfs_ialloc.h"
  48 #include "xfs_alloc.h"
  49 #include "xfs_bmap.h"
  50 #include "xfs_attr.h"
  51 #include "xfs_rw.h"
  52 #include "xfs_error.h"
  53 #include "xfs_quota.h"
  54 #include "xfs_utils.h"
  55 #include "xfs_rtalloc.h"
  56 #include "xfs_refcache.h"
  57 #include "xfs_trans_space.h"
  58 #include "xfs_log_priv.h"
  59 #include "xfs_mac.h"
  60
  61
  62 /*
  63  * The maximum pathlen is 1024 bytes. Since the minimum file system
  64  * blocksize is 512 bytes, we can get a max of 2 extents back from
  65  * bmapi.
      *
      * SYMLINK_MAPS is that worst case: 1024 / 512 = 2 mappings.
  66  */
  67 #define SYMLINK_MAPS 2
  68
  69 /*
  70  * For xfs, we check that the file isn't too big to be opened by this kernel.
  71  * No other open action is required for regular files.  Devices are handled
  72  * through the specfs file system, pipes through fifofs.  Device and
  73  * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
  74  * when a new vnode is first looked up or created.
      *
      * NOTE(review): no file-size check is visible in the body below; what
      * this function actually does is fail when XFS_FORCED_SHUTDOWN() is
      * true for the inode's mount and, for directories that have extents,
      * start a read-ahead of block 0 of the data fork.  Confirm where the
      * size check described above lives.
      *
      * bdp   - behavior descriptor; the vnode and the XFS inode are both
      *         derived from it
      * credp - caller credentials; not referenced in this function
      *
      * Returns 0, or XFS_ERROR(EIO) when XFS_FORCED_SHUTDOWN() is true.
  75  */
  76 STATIC int
  77 xfs_open(
  78         bhv_desc_t      *bdp,
  79         cred_t          *credp)
  80 {
  81         int             mode;
  82         vnode_t         *vp;
  83         xfs_inode_t     *ip;
  84
  85         vp = BHV_TO_VNODE(bdp);
  86         ip = XFS_BHVTOI(bdp);
  87
  88         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  89                 return XFS_ERROR(EIO);
  90
  91         /*
  92          * If it's a directory with any blocks, read-ahead block 0
  93          * as we're almost certain to have the next operation be a read there.
  94          */
  95         if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
                     /*
                      * The test above ran before the lock was taken, so
                      * di_nextents is tested again once the lock is held.
                      * "mode" is whatever xfs_ilock_map_shared() returned
                      * and is handed back to xfs_iunlock() unchanged.
                      */
  96                 mode = xfs_ilock_map_shared(ip);
  97                 if (ip->i_d.di_nextents > 0)
                             /* Result deliberately discarded (void cast). */
  98                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  99                 xfs_iunlock(ip, mode);
 100         }
 101         return 0;
 102 }
 103
 104
 105 /*
 106  * xfs_getattr
      *
      * Copy attributes of the in-core inode into *vap.  The fields are
      * filled in stages and vap->va_mask is tested between stages; as soon
      * as the mask requests nothing from the remaining stages the function
      * jumps to all_done, so fields belonging to later stages are left
      * untouched on those paths.
      *
      * bdp   - behavior descriptor; the vnode and the XFS inode are both
      *         derived from it
      * vap   - attribute structure; va_mask is read, other va_* fields
      *         are written
      * flags - when ATTR_LAZY is set the inode lock is neither taken
      *         nor released here
      * credp - caller credentials; not referenced in this function
      *
      * Returns 0, or XFS_ERROR(EIO) when XFS_FORCED_SHUTDOWN() is true
      * for the mount.
 107  */
 108 STATIC int
 109 xfs_getattr(
 110         bhv_desc_t      *bdp,
 111         vattr_t         *vap,
 112         int             flags,
 113         cred_t          *credp)
 114 {
 115         xfs_inode_t     *ip;
 116         xfs_mount_t     *mp;
 117         vnode_t         *vp;
 118
 119         vp  = BHV_TO_VNODE(bdp);
 120         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 121
 122         ip = XFS_BHVTOI(bdp);
 123         mp = ip->i_mount;
 124
 125         if (XFS_FORCED_SHUTDOWN(mp))
 126                 return XFS_ERROR(EIO);
 127
             /*
              * Take the inode lock shared unless the caller passed
              * ATTR_LAZY.  The unlock at all_done repeats the same test,
              * so every exit after this point must go through all_done.
              */
 128         if (!(flags & ATTR_LAZY))
 129                 xfs_ilock(ip, XFS_ILOCK_SHARED);
 130
             /* Stage 1: size.  A mask of exactly XFS_AT_SIZE stops here. */
 131         vap->va_size = ip->i_d.di_size;
 132         if (vap->va_mask == XFS_AT_SIZE)
 133                 goto all_done;
 134
             /*
              * Stage 2: block count (on-disk count plus delayed blocks,
              * converted with XFS_FSB_TO_BB), inode number and link count.
              */
 135         vap->va_nblocks =
 136                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 137         vap->va_nodeid = ip->i_ino;
 138 #if XFS_BIG_INUMS
 139         vap->va_nodeid += mp->m_inoadd;
 140 #endif
 141         vap->va_nlink = ip->i_d.di_nlink;
 142
 143         /*
 144          * Quick exit for non-stat callers
              *
              * NOTE(review): XFS_AT_FSID and XFS_AT_BLKSIZE are accepted by
              * this exit, but no assignment for either is visible before
              * this point (va_blocksize is only set further down).  Confirm
              * that callers using this path do not rely on them.
 145          */
 146         if ((vap->va_mask &
 147             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
 148               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
 149                 goto all_done;
 150
 151         /*
 152          * Copy from in-core inode.
 153          */
 154         vap->va_mode = ip->i_d.di_mode;
 155         vap->va_uid = ip->i_d.di_uid;
 156         vap->va_gid = ip->i_d.di_gid;
 157         vap->va_projid = ip->i_d.di_projid;
 158
 159         /*
 160          * Check vnode type block/char vs. everything else.
              *
              * Block and character devices report the stored rdev and
              * BLKDEV_IOSIZE; everything else reports rdev 0 and a block
              * size that depends on XFS_DIFLAG_REALTIME.
 161          */
 162         switch (ip->i_d.di_mode & S_IFMT) {
 163         case S_IFBLK:
 164         case S_IFCHR:
 165                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
 166                 vap->va_blocksize = BLKDEV_IOSIZE;
 167                 break;
 168         default:
 169                 vap->va_rdev = 0;
 170
 171                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
 172                         vap->va_blocksize = xfs_preferred_iosize(mp);
 173                 } else {
 174
 175                         /*
 176                          * If the file blocks are being allocated from a
 177                          * realtime partition, then return the inode's
 178                          * realtime extent size or the realtime volume's
 179                          * extent size.
                              *
                              * The inode's di_extsize is used when non-zero,
                              * otherwise the superblock's sb_rextsize; either
                              * value is shifted left by sb_blocklog.
 180                          */
 181                         vap->va_blocksize = ip->i_d.di_extsize ?
 182                                 (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
 183                                 (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
 184                 }
 185                 break;
 186         }
 187
             /* Access, modification and change times, seconds and nanoseconds. */
 188         vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
 189         vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
 190         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 191         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
 192         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
 193         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
 194
 195         /*
 196          * Exit for stat callers.  See if any of the rest of the fields
 197          * to be filled in are needed.
 198          */
 199         if ((vap->va_mask &
 200              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 201               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 202                 goto all_done;
 203
 204         /*
 205          * Convert di_flags to xflags.
 206          */
 207         vap->va_xflags = xfs_ip2xflags(ip);
 208
 209         /*
 210          * Exit for inode revalidate.  See if any of the rest of
 211          * the fields to be filled in are needed.
              *
              * Same mask as the previous exit minus XFS_AT_XFLAGS, which
              * has just been satisfied.
 212          */
 213         if ((vap->va_mask &
 214              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 215               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 216                 goto all_done;
 217
             /*
              * Final stage: extent size, extent counts and generation.
              * di_extsize is shifted left by sb_blocklog (presumably a
              * filesystem-block to byte conversion -- TODO confirm).
              *
              * For each fork, when XFS_IFEXTENTS is set in the fork's
              * flags the count is computed as if_bytes divided by
              * sizeof(xfs_bmbt_rec_t); otherwise the count stored in
              * i_d is reported.  When i_afp is NULL, va_anextents is 0.
              */
 218         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
 219         vap->va_nextents =
 220                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
 221                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
 222                         ip->i_d.di_nextents;
 223         if (ip->i_afp)
 224                 vap->va_anextents =
 225                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
 226                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
 227                                  ip->i_d.di_anextents;
 228         else
 229                 vap->va_anextents = 0;
 230         vap->va_gen = ip->i_d.di_gen;
 231
             /* Common exit: release the lock only if it was taken above. */
 232  all_done:
 233         if (!(flags & ATTR_LAZY))
 234                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 235         return 0;
 236 }
 237
 238
 239 /*
 240  * xfs_setattr
 241  */
 242 int
 243 xfs_setattr(
 244         bhv_desc_t              *bdp,
 245         vattr_t                 *vap,
 246         int                     flags,
 247         cred_t                  *credp)
 248 {
 249         xfs_inode_t             *ip;
 250         xfs_trans_t             *tp;
 251         xfs_mount_t             *mp;
 252         int                     mask;
 253         int                     code;
 254         uint                    lock_flags;
 255         uint                    commit_flags=0;
 256         uid_t                   uid=0, iuid=0;
 257         gid_t                   gid=0, igid=0;
 258         int                     timeflags = 0;
 259         vnode_t                 *vp;
 260         xfs_prid_t              projid=0, iprojid=0;
 261         int                     mandlock_before, mandlock_after;
 262         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
 263         int                     file_owner;
 264         int                     need_iolock = 1;
 265
 266         vp = BHV_TO_VNODE(bdp);
 267         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 268
 269         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
 270                 return XFS_ERROR(EROFS);
 271
 272         /*
 273          * Cannot set certain attributes.
 274          */
 275         mask = vap->va_mask;
 276         if (mask & XFS_AT_NOSET) {
 277                 return XFS_ERROR(EINVAL);
 278         }
 279
 280         ip = XFS_BHVTOI(bdp);
 281         mp = ip->i_mount;
 282
 283         if (XFS_FORCED_SHUTDOWN(mp))
 284                 return XFS_ERROR(EIO);
 285
 286         /*
 287          * Timestamps do not need to be logged and hence do not
 288          * need to be done within a transaction.
 289          */
 290         if (mask & XFS_AT_UPDTIMES) {
 291                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 292                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 293                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 294                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 295                 xfs_ichgtime(ip, timeflags);
 296                 return 0;
 297         }
 298
 299         olddquot1 = olddquot2 = NULL;
 300         udqp = gdqp = NULL;
 301
 302         /*
 303          * If disk quotas is on, we make sure that the dquots do exist on disk,
 304          * before we start any other transactions. Trying to do this later
 305          * is messy. We don't care to take a readlock to look at the ids
 306          * in inode here, because we can't hold it across the trans_reserve.
 307          * If the IDs do change before we take the ilock, we're covered
 308          * because the i_*dquot fields will get updated anyway.
 309          */
 310         if (XFS_IS_QUOTA_ON(mp) &&
 311             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 312                 uint    qflags = 0;
 313
 314                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 315                         uid = vap->va_uid;
 316                         qflags |= XFS_QMOPT_UQUOTA;
 317                 } else {
 318                         uid = ip->i_d.di_uid;
 319                 }
 320                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 321                         gid = vap->va_gid;
 322                         qflags |= XFS_QMOPT_GQUOTA;
 323                 }  else {
 324                         gid = ip->i_d.di_gid;
 325                 }
 326                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 327                         projid = vap->va_projid;
 328                         qflags |= XFS_QMOPT_PQUOTA;
 329                 }  else {
 330                         projid = ip->i_d.di_projid;
 331                 }
 332                 /*
 333                  * We take a reference when we initialize udqp and gdqp,
 334                  * so it is important that we never blindly double trip on
 335                  * the same variable. See xfs_create() for an example.
 336                  */
 337                 ASSERT(udqp == NULL);
 338                 ASSERT(gdqp == NULL);
 339                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 340                                          &udqp, &gdqp);
 341                 if (code)
 342                         return (code);
 343         }
 344
 345         /*
 346          * For the other attributes, we acquire the inode lock and
 347          * first do an error checking pass.
 348          */
 349         tp = NULL;
 350         lock_flags = XFS_ILOCK_EXCL;
 351         ASSERT(flags & ATTR_NOLOCK ? flags & ATTR_DMI : 1);
 352         if (flags & ATTR_NOLOCK)
 353                 need_iolock = 0;
 354         if (!(mask & XFS_AT_SIZE)) {
 355                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 356                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 357                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 358                         commit_flags = 0;
 359                         if ((code = xfs_trans_reserve(tp, 0,
 360                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 361                                                      0, 0))) {
 362                                 lock_flags = 0;
 363                                 goto error_return;
 364                         }
 365                 }
 366         } else {
 367                 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
 368                     !(flags & ATTR_DMI)) {
 369                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 370                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
 371                                 vap->va_size, 0, dmflags, NULL);
 372                         if (code) {
 373                                 lock_flags = 0;
 374                                 goto error_return;
 375                         }
 376                 }
 377                 if (need_iolock)
 378                         lock_flags |= XFS_IOLOCK_EXCL;
 379         }
 380
 381         xfs_ilock(ip, lock_flags);
 382
 383         /* boolean: are we the file owner? */
 384         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 385
 386         /*
 387          * Change various properties of a file.
 388          * Only the owner or users with CAP_FOWNER
 389          * capability may do these things.
 390          */
 391         if (mask &
 392             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 393              XFS_AT_GID|XFS_AT_PROJID)) {
 394                 /*
 395                  * CAP_FOWNER overrides the following restrictions:
 396                  *
 397                  * The user ID of the calling process must be equal
 398                  * to the file owner ID, except in cases where the
 399                  * CAP_FSETID capability is applicable.
 400                  */
 401                 if (!file_owner && !capable(CAP_FOWNER)) {
 402                         code = XFS_ERROR(EPERM);
 403                         goto error_return;
 404                 }
 405
 406                 /*
 407                  * CAP_FSETID overrides the following restrictions:
 408                  *
 409                  * The effective user ID of the calling process shall match
 410                  * the file owner when setting the set-user-ID and
 411                  * set-group-ID bits on that file.
 412                  *
 413                  * The effective group ID or one of the supplementary group
 414                  * IDs of the calling process shall match the group owner of
 415                  * the file when setting the set-group-ID bit on that file
 416                  */
 417                 if (mask & XFS_AT_MODE) {
 418                         mode_t m = 0;
 419
 420                         if ((vap->va_mode & S_ISUID) && !file_owner)
 421                                 m |= S_ISUID;
 422                         if ((vap->va_mode & S_ISGID) &&
 423                             !in_group_p((gid_t)ip->i_d.di_gid))
 424                                 m |= S_ISGID;
 425 #if 0
 426                         /* Linux allows this, Irix doesn't. */
 427                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
 428                                 m |= S_ISVTX;
 429 #endif
 430                         if (m && !capable(CAP_FSETID))
 431                                 vap->va_mode &= ~m;
 432                 }
 433         }
 434
 435         /*
 436          * Change file ownership.  Must be the owner or privileged.
 437          * If the system was configured with the "restricted_chown"
 438          * option, the owner is not permitted to give away the file,
 439          * and can change the group id only to a group of which he
 440          * or she is a member.
 441          */
 442         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 443                 /*
 444                  * These IDs could have changed since we last looked at them.
 445                  * But, we're assured that if the ownership did change
 446                  * while we didn't have the inode locked, inode's dquot(s)
 447                  * would have changed also.
 448                  */
 449                 iuid = ip->i_d.di_uid;
 450                 iprojid = ip->i_d.di_projid;
 451                 igid = ip->i_d.di_gid;
 452                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 453                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 454                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 455                          iprojid;
 456
 457                 /*
 458                  * CAP_CHOWN overrides the following restrictions:
 459                  *
 460                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 461                  * shall override the restriction that a process cannot
 462                  * change the user ID of a file it owns and the restriction
 463                  * that the group ID supplied to the chown() function
 464                  * shall be equal to either the group ID or one of the
 465                  * supplementary group IDs of the calling process.
 466                  */
 467                 if (restricted_chown &&
 468                     (iuid != uid || (igid != gid &&
 469                                      !in_group_p((gid_t)gid))) &&
 470                     !capable(CAP_CHOWN)) {
 471                         code = XFS_ERROR(EPERM);
 472                         goto error_return;
 473                 }
 474                 /*
 475                  * Do a quota reservation only if uid/projid/gid is actually
 476                  * going to change.
 477                  */
 478                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 479                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 480                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 481                         ASSERT(tp);
 482                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 483                                                 capable(CAP_FOWNER) ?
 484                                                 XFS_QMOPT_FORCE_RES : 0);
 485                         if (code)       /* out of quota */
 486                                 goto error_return;
 487                 }
 488         }
 489
 490         /*
 491          * Truncate file.  Must have write permission and not be a directory.
 492          */
 493         if (mask & XFS_AT_SIZE) {
 494                 /* Short circuit the truncate case for zero length files */
 495                 if ((vap->va_size == 0) &&
 496                    (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
 497                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 498                         lock_flags &= ~XFS_ILOCK_EXCL;
 499                         if (mask & XFS_AT_CTIME)
 500                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 501                         code = 0;
 502                         goto error_return;
 503                 }
 504
 505                 if (VN_ISDIR(vp)) {
 506                         code = XFS_ERROR(EISDIR);
 507                         goto error_return;
 508                 } else if (!VN_ISREG(vp)) {
 509                         code = XFS_ERROR(EINVAL);
 510                         goto error_return;
 511                 }
 512                 /*
 513                  * Make sure that the dquots are attached to the inode.
 514                  */
 515                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 516                         goto error_return;
 517         }
 518
 519         /*
 520          * Change file access or modified times.
 521          */
 522         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 523                 if (!file_owner) {
 524                         if ((flags & ATTR_UTIME) &&
 525                             !capable(CAP_FOWNER)) {
 526                                 code = XFS_ERROR(EPERM);
 527                                 goto error_return;
 528                         }
 529                 }
 530         }
 531
 532         /*
 533          * Change extent size or realtime flag.
 534          */
 535         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 536                 /*
 537                  * Can't change extent size if any extents are allocated.
 538                  */
 539                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 540                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 541                      vap->va_extsize) ) {
 542                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 543                         goto error_return;
 544                 }
 545
 546                 /*
 547                  * Can't set extent size unless the file is marked, or
 548                  * about to be marked as a realtime file.
 549                  *
 550                  * This check will be removed when fixed size extents
 551                  * with buffered data writes is implemented.
 552                  *
 553                  */
 554                 if ((mask & XFS_AT_EXTSIZE)                     &&
 555                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 556                      vap->va_extsize) &&
 557                     (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
 558                        ((mask & XFS_AT_XFLAGS) &&
 559                         (vap->va_xflags & XFS_XFLAG_REALTIME))))) {
 560                         code = XFS_ERROR(EINVAL);
 561                         goto error_return;
 562                 }
 563
 564                 /*
 565                  * Can't change realtime flag if any extents are allocated.
 566                  */
 567                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 568                     (mask & XFS_AT_XFLAGS) &&
 569                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
 570                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 571                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 572                         goto error_return;
 573                 }
 574                 /*
 575                  * Extent size must be a multiple of the appropriate block
 576                  * size, if set at all.
 577                  */
 578                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 579                         xfs_extlen_t    size;
 580
 581                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
 582                             ((mask & XFS_AT_XFLAGS) &&
 583                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 584                                 size = mp->m_sb.sb_rextsize <<
 585                                        mp->m_sb.sb_blocklog;
 586                         } else {
 587                                 size = mp->m_sb.sb_blocksize;
 588                         }
 589                         if (vap->va_extsize % size) {
 590                                 code = XFS_ERROR(EINVAL);
 591                                 goto error_return;
 592                         }
 593                 }
 594                 /*
 595                  * If realtime flag is set then must have realtime data.
 596                  */
 597                 if ((mask & XFS_AT_XFLAGS) &&
 598                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 599                         if ((mp->m_sb.sb_rblocks == 0) ||
 600                             (mp->m_sb.sb_rextsize == 0) ||
 601                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 602                                 code = XFS_ERROR(EINVAL);
 603                                 goto error_return;
 604                         }
 605                 }
 606
 607                 /*
 608                  * Can't modify an immutable/append-only file unless
 609                  * we have appropriate permission.
 610                  */
 611                 if ((mask & XFS_AT_XFLAGS) &&
 612                     (ip->i_d.di_flags &
 613                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 614                      (vap->va_xflags &
 615                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 616                     !capable(CAP_LINUX_IMMUTABLE)) {
 617                         code = XFS_ERROR(EPERM);
 618                         goto error_return;
 619                 }
 620         }
 621
 622         /*
 623          * Now we can make the changes.  Before we join the inode
 624          * to the transaction, if XFS_AT_SIZE is set then take care of
 625          * the part of the truncation that must be done without the
 626          * inode lock.  This needs to be done before joining the inode
 627          * to the transaction, because the inode cannot be unlocked
 628          * once it is a part of the transaction.
 629          */
 630         if (mask & XFS_AT_SIZE) {
 631                 code = 0;
 632                 if ((vap->va_size > ip->i_d.di_size) &&
 633                     (flags & ATTR_NOSIZETOK) == 0) {
 634                         code = xfs_igrow_start(ip, vap->va_size, credp);
 635                 }
 636                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 637                 if (!code)
 638                         code = xfs_itruncate_data(ip, vap->va_size);
 639                 if (code) {
 640                         ASSERT(tp == NULL);
 641                         lock_flags &= ~XFS_ILOCK_EXCL;
 642                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 643                         goto error_return;
 644                 }
 645                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 646                 if ((code = xfs_trans_reserve(tp, 0,
 647                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 648                                              XFS_TRANS_PERM_LOG_RES,
 649                                              XFS_ITRUNCATE_LOG_COUNT))) {
 650                         xfs_trans_cancel(tp, 0);
 651                         if (need_iolock)
 652                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 653                         return code;
 654                 }
 655                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 656                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 657         }
 658
 659         if (tp) {
 660                 xfs_trans_ijoin(tp, ip, lock_flags);
 661                 xfs_trans_ihold(tp, ip);
 662         }
 663
 664         /* determine whether mandatory locking mode changes */
 665         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
 666
 667         /*
 668          * Truncate file.  Must have write permission and not be a directory.
 669          */
 670         if (mask & XFS_AT_SIZE) {
 671                 if (vap->va_size > ip->i_d.di_size) {
 672                         xfs_igrow_finish(tp, ip, vap->va_size,
 673                             !(flags & ATTR_DMI));
 674                 } else if ((vap->va_size <= ip->i_d.di_size) ||
 675                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 676                         /*
 677                          * signal a sync transaction unless
 678                          * we're truncating an already unlinked
 679                          * file on a wsync filesystem
 680                          */
 681                         code = xfs_itruncate_finish(&tp, ip,
 682                                             (xfs_fsize_t)vap->va_size,
 683                                             XFS_DATA_FORK,
 684                                             ((ip->i_d.di_nlink != 0 ||
 685                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 686                                              ? 1 : 0));
 687                         if (code) {
 688                                 goto abort_return;
 689                         }
 690                 }
 691                 /*
 692                  * Have to do this even if the file's size doesn't change.
 693                  */
 694                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 695         }
 696
 697         /*
 698          * Change file access modes.
 699          */
 700         if (mask & XFS_AT_MODE) {
 701                 ip->i_d.di_mode &= S_IFMT;
 702                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 703
 704                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 705                 timeflags |= XFS_ICHGTIME_CHG;
 706         }
 707
 708         /*
 709          * Change file ownership.  Must be the owner or privileged.
 710          * If the system was configured with the "restricted_chown"
 711          * option, the owner is not permitted to give away the file,
 712          * and can change the group id only to a group of which he
 713          * or she is a member.
 714          */
 715         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 716                 /*
 717                  * CAP_FSETID overrides the following restrictions:
 718                  *
 719                  * The set-user-ID and set-group-ID bits of a file will be
 720                  * cleared upon successful return from chown()
 721                  */
 722                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 723                     !capable(CAP_FSETID)) {
 724                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 725                 }
 726
 727                 /*
 728                  * Change the ownerships and register quota modifications
 729                  * in the transaction.
 730                  */
 731                 if (iuid != uid) {
 732                         if (XFS_IS_UQUOTA_ON(mp)) {
 733                                 ASSERT(mask & XFS_AT_UID);
 734                                 ASSERT(udqp);
 735                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 736                                                         &ip->i_udquot, udqp);
 737                         }
 738                         ip->i_d.di_uid = uid;
 739                 }
 740                 if (igid != gid) {
 741                         if (XFS_IS_GQUOTA_ON(mp)) {
 742                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 743                                 ASSERT(mask & XFS_AT_GID);
 744                                 ASSERT(gdqp);
 745                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 746                                                         &ip->i_gdquot, gdqp);
 747                         }
 748                         ip->i_d.di_gid = gid;
 749                 }
 750                 if (iprojid != projid) {
 751                         if (XFS_IS_PQUOTA_ON(mp)) {
 752                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
 753                                 ASSERT(mask & XFS_AT_PROJID);
 754                                 ASSERT(gdqp);
 755                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 756                                                         &ip->i_gdquot, gdqp);
 757                         }
 758                         ip->i_d.di_projid = projid;
 759                         /*
 760                          * We may have to rev the inode as well as
 761                          * the superblock version number since projids didn't
 762                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 763                          */
 764                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 765                                 xfs_bump_ino_vers2(tp, ip);
 766                 }
 767
 768                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 769                 timeflags |= XFS_ICHGTIME_CHG;
 770         }
 771
 772
 773         /*
 774          * Change file access or modified times.
 775          */
 776         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 777                 if (mask & XFS_AT_ATIME) {
 778                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 779                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 780                         ip->i_update_core = 1;
 781                         timeflags &= ~XFS_ICHGTIME_ACC;
 782                 }
 783                 if (mask & XFS_AT_MTIME) {
 784                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 785                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 786                         timeflags &= ~XFS_ICHGTIME_MOD;
 787                         timeflags |= XFS_ICHGTIME_CHG;
 788                 }
 789                 if (tp && (flags & ATTR_UTIME))
 790                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 791         }
 792
 793         /*
 794          * Change XFS-added attributes.
 795          */
 796         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 797                 if (mask & XFS_AT_EXTSIZE) {
 798                         /*
 799                          * Converting bytes to fs blocks.
 800                          */
 801                         ip->i_d.di_extsize = vap->va_extsize >>
 802                                 mp->m_sb.sb_blocklog;
 803                 }
 804                 if (mask & XFS_AT_XFLAGS) {
 805                         uint    di_flags;
 806
 807                         /* can't set PREALLOC this way, just preserve it */
 808                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 809                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 810                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
 811                         if (vap->va_xflags & XFS_XFLAG_APPEND)
 812                                 di_flags |= XFS_DIFLAG_APPEND;
 813                         if (vap->va_xflags & XFS_XFLAG_SYNC)
 814                                 di_flags |= XFS_DIFLAG_SYNC;
 815                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
 816                                 di_flags |= XFS_DIFLAG_NOATIME;
 817                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
 818                                 di_flags |= XFS_DIFLAG_NODUMP;
 819                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 820                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 821                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 822                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 823                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 824                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 825                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
 826                         } else {
 827                                 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
 828                                         di_flags |= XFS_DIFLAG_REALTIME;
 829                                         ip->i_iocore.io_flags |= XFS_IOCORE_RT;
 830                                 } else {
 831                                         ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
 832                                 }
 833                         }
 834                         ip->i_d.di_flags = di_flags;
 835                 }
 836                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 837                 timeflags |= XFS_ICHGTIME_CHG;
 838         }
 839
 840         /*
 841          * Change file inode change time only if XFS_AT_CTIME set
 842          * AND we have been called by a DMI function.
 843          */
 844
 845         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 846                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 847                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 848                 ip->i_update_core = 1;
 849                 timeflags &= ~XFS_ICHGTIME_CHG;
 850         }
 851
 852         /*
 853          * Send out timestamp changes that need to be set to the
 854          * current time.  Not done when called by a DMI function.
 855          */
 856         if (timeflags && !(flags & ATTR_DMI))
 857                 xfs_ichgtime(ip, timeflags);
 858
 859         XFS_STATS_INC(xs_ig_attrchg);
 860
 861         /*
 862          * If this is a synchronous mount, make sure that the
 863          * transaction goes to disk before returning to the user.
 864          * This is slightly sub-optimal in that truncates require
 865          * two sync transactions instead of one for wsync filesystems.
 866          * One for the truncate and one for the timestamps since we
 867          * don't want to change the timestamps unless we're sure the
 868          * truncate worked.  Truncates are less than 1% of the laddis
 869          * mix so this probably isn't worth the trouble to optimize.
 870          */
 871         code = 0;
 872         if (tp) {
 873                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 874                         xfs_trans_set_sync(tp);
 875
 876                 code = xfs_trans_commit(tp, commit_flags, NULL);
 877         }
 878
 879         /*
 880          * If the (regular) file's mandatory locking mode changed, then
 881          * notify the vnode.  We do this under the inode lock to prevent
 882          * racing calls to vop_vnode_change.
 883          */
 884         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 885         if (mandlock_before != mandlock_after) {
 886                 VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
 887                                  mandlock_after);
 888         }
 889
 890         xfs_iunlock(ip, lock_flags);
 891
 892         /*
 893          * Release any dquot(s) the inode had kept before chown.
 894          */
 895         XFS_QM_DQRELE(mp, olddquot1);
 896         XFS_QM_DQRELE(mp, olddquot2);
 897         XFS_QM_DQRELE(mp, udqp);
 898         XFS_QM_DQRELE(mp, gdqp);
 899
 900         if (code) {
 901                 return code;
 902         }
 903
 904         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
 905             !(flags & ATTR_DMI)) {
 906                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
 907                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 908                                         0, 0, AT_DELAY_FLAG(flags));
 909         }
 910         return 0;
 911
 912  abort_return:
 913         commit_flags |= XFS_TRANS_ABORT;
 914         /* FALLTHROUGH */
 915  error_return:
 916         XFS_QM_DQRELE(mp, udqp);
 917         XFS_QM_DQRELE(mp, gdqp);
 918         if (tp) {
 919                 xfs_trans_cancel(tp, commit_flags);
 920         }
 921         if (lock_flags != 0) {
 922                 xfs_iunlock(ip, lock_flags);
 923         }
 924         return code;
 925 }
 926
 927
 928 /*
 929  * xfs_access
 930  * Null conversion from vnode mode bits to inode mode bits, as in efs.
 931  */
 932 STATIC int
 933 xfs_access(
 934         bhv_desc_t      *bdp,
 935         int             mode,
 936         cred_t          *credp)
 937 {
 938         xfs_inode_t     *ip;
 939         int             error;
 940
 941         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
 942                                                (inst_t *)__return_address);
 943
 944         ip = XFS_BHVTOI(bdp);
 945         xfs_ilock(ip, XFS_ILOCK_SHARED);
 946         error = xfs_iaccess(ip, mode, credp);
 947         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 948         return error;
 949 }
 950
 951
 952 /*
 953  * xfs_readlink
 954  *
 955  */
 956 STATIC int
 957 xfs_readlink(
 958         bhv_desc_t      *bdp,
 959         uio_t           *uiop,
 960         int             ioflags,
 961         cred_t          *credp)
 962 {
 963         xfs_inode_t     *ip;
 964         int             count;
 965         xfs_off_t       offset;
 966         int             pathlen;
 967         vnode_t         *vp;
 968         int             error = 0;
 969         xfs_mount_t     *mp;
 970         int             nmaps;
 971         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 972         xfs_daddr_t     d;
 973         int             byte_cnt;
 974         int             n;
 975         xfs_buf_t       *bp;
 976
 977         vp = BHV_TO_VNODE(bdp);
 978         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 979
 980         ip = XFS_BHVTOI(bdp);
 981         mp = ip->i_mount;
 982
 983         if (XFS_FORCED_SHUTDOWN(mp))
 984                 return XFS_ERROR(EIO);
 985
 986         xfs_ilock(ip, XFS_ILOCK_SHARED);
 987
 988         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 989
 990         offset = uiop->uio_offset;
 991         count = uiop->uio_resid;
 992
 993         if (offset < 0) {
 994                 error = XFS_ERROR(EINVAL);
 995                 goto error_return;
 996         }
 997         if (count <= 0) {
 998                 error = 0;
 999                 goto error_return;
1000         }
1001
1002         if (!(ioflags & IO_INVIS)) {
1003                 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
1004         }
1005
1006         /*
1007          * See if the symlink is stored inline.
1008          */
1009         pathlen = (int)ip->i_d.di_size;
1010
1011         if (ip->i_df.if_flags & XFS_IFINLINE) {
1012                 error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
1013         }
1014         else {
1015                 /*
1016                  * Symlink not inline.  Call bmap to get it in.
1017                  */
1018                 nmaps = SYMLINK_MAPS;
1019
1020                 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1021                                   0, NULL, 0, mval, &nmaps, NULL);
1022
1023                 if (error) {
1024                         goto error_return;
1025                 }
1026
1027                 for (n = 0; n < nmaps; n++) {
1028                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1029                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1030                         bp = xfs_buf_read(mp->m_ddev_targp, d,
1031                                       BTOBB(byte_cnt), 0);
1032                         error = XFS_BUF_GETERROR(bp);
1033                         if (error) {
1034                                 xfs_ioerror_alert("xfs_readlink",
1035                                           ip->i_mount, bp, XFS_BUF_ADDR(bp));
1036                                 xfs_buf_relse(bp);
1037                                 goto error_return;
1038                         }
1039                         if (pathlen < byte_cnt)
1040                                 byte_cnt = pathlen;
1041                         pathlen -= byte_cnt;
1042
1043                         error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1044                         xfs_buf_relse (bp);
1045                 }
1046
1047         }
1048
1049
1050 error_return:
1051
1052         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1053
1054         return error;
1055 }
1056
1057
1058 /*
1059  * xfs_fsync
1060  *
1061  * This is called to sync the inode and its data out to disk.
1062  * We need to hold the I/O lock while flushing the data, and
1063  * the inode lock while flushing the inode.  The inode lock CANNOT
1064  * be held while flushing the data, so acquire after we're done
1065  * with that.
1066  */
1067 STATIC int
1068 xfs_fsync(
1069         bhv_desc_t      *bdp,
1070         int             flag,
1071         cred_t          *credp,
1072         xfs_off_t       start,
1073         xfs_off_t       stop)
1074 {
1075         xfs_inode_t     *ip;
1076         xfs_trans_t     *tp;
1077         int             error;
1078         int             log_flushed = 0, changed = 1;
1079
1080         vn_trace_entry(BHV_TO_VNODE(bdp),
1081                         __FUNCTION__, (inst_t *)__return_address);
1082
1083         ip = XFS_BHVTOI(bdp);
1084
1085         ASSERT(start >= 0 && stop >= -1);
1086
1087         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1088                 return XFS_ERROR(EIO);
1089
1090         /*
1091          * We always need to make sure that the required inode state
1092          * is safe on disk.  The vnode might be clean but because
1093          * of committed transactions that haven't hit the disk yet.
1094          * Likewise, there could be unflushed non-transactional
1095          * changes to the inode core that have to go to disk.
1096          *
1097          * The following code depends on one assumption:  that
1098          * any transaction that changes an inode logs the core
1099          * because it has to change some field in the inode core
1100          * (typically nextents or nblocks).  That assumption
1101          * implies that any transactions against an inode will
1102          * catch any non-transactional updates.  If inode-altering
1103          * transactions exist that violate this assumption, the
1104          * code breaks.  Right now, it figures that if the involved
1105          * update_* field is clear and the inode is unpinned, the
1106          * inode is clean.  Either it's been flushed or it's been
1107          * committed and the commit has hit the disk unpinning the inode.
1108          * (Note that xfs_inode_item_format() called at commit clears
1109          * the update_* fields.)
1110          */
1111         xfs_ilock(ip, XFS_ILOCK_SHARED);
1112
1113         /* If we are flushing data then we care about update_size
1114          * being set, otherwise we care about update_core
1115          */
1116         if ((flag & FSYNC_DATA) ?
1117                         (ip->i_update_size == 0) :
1118                         (ip->i_update_core == 0)) {
1119                 /*
1120                  * Timestamps/size haven't changed since last inode
1121                  * flush or inode transaction commit.  That means
1122                  * either nothing got written or a transaction
1123                  * committed which caught the updates.  If the
1124                  * latter happened and the transaction hasn't
1125                  * hit the disk yet, the inode will be still
1126                  * be pinned.  If it is, force the log.
1127                  */
1128
1129                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1130
1131                 if (xfs_ipincount(ip)) {
1132                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1133                                       XFS_LOG_FORCE |
1134                                       ((flag & FSYNC_WAIT)
1135                                        ? XFS_LOG_SYNC : 0),
1136                                       &log_flushed);
1137                 } else {
1138                         /*
1139                          * If the inode is not pinned and nothing
1140                          * has changed we don't need to flush the
1141                          * cache.
1142                          */
1143                         changed = 0;
1144                 }
1145                 error = 0;
1146         } else  {
1147                 /*
1148                  * Kick off a transaction to log the inode
1149                  * core to get the updates.  Make it
1150                  * sync if FSYNC_WAIT is passed in (which
1151                  * is done by everybody but specfs).  The
1152                  * sync transaction will also force the log.
1153                  */
1154                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1155                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1156                 if ((error = xfs_trans_reserve(tp, 0,
1157                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1158                                 0, 0, 0)))  {
1159                         xfs_trans_cancel(tp, 0);
1160                         return error;
1161                 }
1162                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1163
1164                 /*
1165                  * Note - it's possible that we might have pushed
1166                  * ourselves out of the way during trans_reserve
1167                  * which would flush the inode.  But there's no
1168                  * guarantee that the inode buffer has actually
1169                  * gone out yet (it's delwri).  Plus the buffer
1170                  * could be pinned anyway if it's part of an
1171                  * inode in another recent transaction.  So we
1172                  * play it safe and fire off the transaction anyway.
1173                  */
1174                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1175                 xfs_trans_ihold(tp, ip);
1176                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1177                 if (flag & FSYNC_WAIT)
1178                         xfs_trans_set_sync(tp);
1179                 error = _xfs_trans_commit(tp, 0, NULL, &log_flushed);
1180
1181                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1182         }
1183
1184         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1185                 /*
1186                  * If the log write didn't issue an ordered tag we need
1187                  * to flush the disk cache for the data device now.
1188                  */
1189                 if (!log_flushed)
1190                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1191
1192                 /*
1193                  * If this inode is on the RT dev we need to flush that
1194                  * cache aswell.
1195                  */
1196                 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1197                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1198         }
1199
1200         return error;
1201 }
1202
1203 /*
1204  * This is called by xfs_inactive to free any blocks beyond eof,
1205  * when the link count isn't zero.
1206  */
1207 STATIC int
1208 xfs_inactive_free_eofblocks(
1209         xfs_mount_t     *mp,
1210         xfs_inode_t     *ip)
1211 {
1212         xfs_trans_t     *tp;
1213         int             error;
1214         xfs_fileoff_t   end_fsb;
1215         xfs_fileoff_t   last_fsb;
1216         xfs_filblks_t   map_len;
1217         int             nimaps;
1218         xfs_bmbt_irec_t imap;
1219
1220         /*
1221          * Figure out if there are any blocks beyond the end
1222          * of the file.  If not, then there is nothing to do.
1223          */
1224         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
1225         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1226         map_len = last_fsb - end_fsb;
1227         if (map_len <= 0)
1228                 return (0);
1229
1230         nimaps = 1;
1231         xfs_ilock(ip, XFS_ILOCK_SHARED);
1232         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1233                           NULL, 0, &imap, &nimaps, NULL);
1234         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1235
1236         if (!error && (nimaps != 0) &&
1237             (imap.br_startblock != HOLESTARTBLOCK)) {
1238                 /*
1239                  * Attach the dquots to the inode up front.
1240                  */
1241                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1242                         return (error);
1243
1244                 /*
1245                  * There are blocks after the end of file.
1246                  * Free them up now by truncating the file to
1247                  * its current size.
1248                  */
1249                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1250
1251                 /*
1252                  * Do the xfs_itruncate_start() call before
1253                  * reserving any log space because
1254                  * itruncate_start will call into the buffer
1255                  * cache and we can't
1256                  * do that within a transaction.
1257                  */
1258                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1259                 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1260                                     ip->i_d.di_size);
1261
1262                 error = xfs_trans_reserve(tp, 0,
1263                                           XFS_ITRUNCATE_LOG_RES(mp),
1264                                           0, XFS_TRANS_PERM_LOG_RES,
1265                                           XFS_ITRUNCATE_LOG_COUNT);
1266                 if (error) {
1267                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1268                         xfs_trans_cancel(tp, 0);
1269                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1270                         return (error);
1271                 }
1272
1273                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1274                 xfs_trans_ijoin(tp, ip,
1275                                 XFS_IOLOCK_EXCL |
1276                                 XFS_ILOCK_EXCL);
1277                 xfs_trans_ihold(tp, ip);
1278
1279                 error = xfs_itruncate_finish(&tp, ip,
1280                                              ip->i_d.di_size,
1281                                              XFS_DATA_FORK,
1282                                              0);
1283                 /*
1284                  * If we get an error at this point we
1285                  * simply don't bother truncating the file.
1286                  */
1287                 if (error) {
1288                         xfs_trans_cancel(tp,
1289                                          (XFS_TRANS_RELEASE_LOG_RES |
1290                                           XFS_TRANS_ABORT));
1291                 } else {
1292                         error = xfs_trans_commit(tp,
1293                                                 XFS_TRANS_RELEASE_LOG_RES,
1294                                                 NULL);
1295                 }
1296                 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1297         }
1298         return (error);
1299 }
1300
1301 /*
1302  * Free a symlink that has blocks associated with it.
1303  */
1304 STATIC int
1305 xfs_inactive_symlink_rmt(
1306         xfs_inode_t     *ip,
1307         xfs_trans_t     **tpp)
1308 {
1309         xfs_buf_t       *bp;
1310         int             committed;
1311         int             done;
1312         int             error;
1313         xfs_fsblock_t   first_block;
1314         xfs_bmap_free_t free_list;
1315         int             i;
1316         xfs_mount_t     *mp;
1317         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1318         int             nmaps;
1319         xfs_trans_t     *ntp;
1320         int             size;
1321         xfs_trans_t     *tp;
1322
1323         tp = *tpp;
1324         mp = ip->i_mount;
1325         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1326         /*
1327          * We're freeing a symlink that has some
1328          * blocks allocated to it.  Free the
1329          * blocks here.  We know that we've got
1330          * either 1 or 2 extents and that we can
1331          * free them all in one bunmapi call.
1332          */
1333         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1334         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1335                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1336                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1337                 xfs_trans_cancel(tp, 0);
1338                 *tpp = NULL;
1339                 return error;
1340         }
1341         /*
1342          * Lock the inode, fix the size, and join it to the transaction.
1343          * Hold it so in the normal path, we still have it locked for
1344          * the second transaction.  In the error paths we need it
1345          * held so the cancel won't rele it, see below.
1346          */
1347         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1348         size = (int)ip->i_d.di_size;
1349         ip->i_d.di_size = 0;
1350         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1351         xfs_trans_ihold(tp, ip);
1352         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1353         /*
1354          * Find the block(s) so we can inval and unmap them.
1355          */
1356         done = 0;
1357         XFS_BMAP_INIT(&free_list, &first_block);
1358         nmaps = sizeof(mval) / sizeof(mval[0]);
1359         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1360                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1361                         &free_list)))
1362                 goto error0;
1363         /*
1364          * Invalidate the block(s).
1365          */
1366         for (i = 0; i < nmaps; i++) {
1367                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1368                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1369                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1370                 xfs_trans_binval(tp, bp);
1371         }
1372         /*
1373          * Unmap the dead block(s) to the free_list.
1374          */
1375         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1376                         &first_block, &free_list, &done)))
1377                 goto error1;
1378         ASSERT(done);
1379         /*
1380          * Commit the first transaction.  This logs the EFI and the inode.
1381          */
1382         if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
1383                 goto error1;
1384         /*
1385          * The transaction must have been committed, since there were
1386          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1387          * The new tp has the extent freeing and EFDs.
1388          */
1389         ASSERT(committed);
1390         /*
1391          * The first xact was committed, so add the inode to the new one.
1392          * Mark it dirty so it will be logged and moved forward in the log as
1393          * part of every commit.
1394          */
1395         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1396         xfs_trans_ihold(tp, ip);
1397         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1398         /*
1399          * Get a new, empty transaction to return to our caller.
1400          */
1401         ntp = xfs_trans_dup(tp);
1402         /*
1403          * Commit the transaction containing extent freeing and EFD's.
1404          * If we get an error on the commit here or on the reserve below,
1405          * we need to unlock the inode since the new transaction doesn't
1406          * have the inode attached.
1407          */
1408         error = xfs_trans_commit(tp, 0, NULL);
1409         tp = ntp;
1410         if (error) {
1411                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1412                 goto error0;
1413         }
1414         /*
1415          * Remove the memory for extent descriptions (just bookkeeping).
1416          */
1417         if (ip->i_df.if_bytes)
1418                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1419         ASSERT(ip->i_df.if_bytes == 0);
1420         /*
1421          * Put an itruncate log reservation in the new transaction
1422          * for our caller.
1423          */
1424         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1425                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1426                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1427                 goto error0;
1428         }
1429         /*
1430          * Return with the inode locked but not joined to the transaction.
1431          */
1432         *tpp = tp;
1433         return 0;
1434
1435  error1:
1436         xfs_bmap_cancel(&free_list);
1437  error0:
1438         /*
1439          * Have to come here with the inode locked and either
1440          * (held and in the transaction) or (not in the transaction).
1441          * If the inode isn't held then cancel would iput it, but
1442          * that's wrong since this is inactive and the vnode ref
1443          * count is 0 already.
1444          * Cancel won't do anything to the inode if held, but it still
1445          * needs to be locked until the cancel is done, if it was
1446          * joined to the transaction.
1447          */
1448         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1449         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1450         *tpp = NULL;
1451         return error;
1452
1453 }
1454
1455 STATIC int
1456 xfs_inactive_symlink_local(
1457         xfs_inode_t     *ip,
1458         xfs_trans_t     **tpp)
1459 {
1460         int             error;
1461
1462         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1463         /*
1464          * We're freeing a symlink which fit into
1465          * the inode.  Just free the memory used
1466          * to hold the old symlink.
1467          */
1468         error = xfs_trans_reserve(*tpp, 0,
1469                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1470                                   0, XFS_TRANS_PERM_LOG_RES,
1471                                   XFS_ITRUNCATE_LOG_COUNT);
1472
1473         if (error) {
1474                 xfs_trans_cancel(*tpp, 0);
1475                 *tpp = NULL;
1476                 return (error);
1477         }
1478         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1479
1480         /*
1481          * Zero length symlinks _can_ exist.
1482          */
1483         if (ip->i_df.if_bytes > 0) {
1484                 xfs_idata_realloc(ip,
1485                                   -(ip->i_df.if_bytes),
1486                                   XFS_DATA_FORK);
1487                 ASSERT(ip->i_df.if_bytes == 0);
1488         }
1489         return (0);
1490 }
1491
1492 /*
1493  *
1494  */
1495 STATIC int
1496 xfs_inactive_attrs(
1497         xfs_inode_t     *ip,
1498         xfs_trans_t     **tpp)
1499 {
1500         xfs_trans_t     *tp;
1501         int             error;
1502         xfs_mount_t     *mp;
1503
1504         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1505         tp = *tpp;
1506         mp = ip->i_mount;
1507         ASSERT(ip->i_d.di_forkoff != 0);
1508         xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1509         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1510
1511         error = xfs_attr_inactive(ip);
1512         if (error) {
1513                 *tpp = NULL;
1514                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1515                 return (error); /* goto out*/
1516         }
1517
1518         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1519         error = xfs_trans_reserve(tp, 0,
1520                                   XFS_IFREE_LOG_RES(mp),
1521                                   0, XFS_TRANS_PERM_LOG_RES,
1522                                   XFS_INACTIVE_LOG_COUNT);
1523         if (error) {
1524                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1525                 xfs_trans_cancel(tp, 0);
1526                 *tpp = NULL;
1527                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1528                 return (error);
1529         }
1530
1531         xfs_ilock(ip, XFS_ILOCK_EXCL);
1532         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1533         xfs_trans_ihold(tp, ip);
1534         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1535
1536         ASSERT(ip->i_d.di_anextents == 0);
1537
1538         *tpp = tp;
1539         return (0);
1540 }
1541
1542 STATIC int
1543 xfs_release(
1544         bhv_desc_t      *bdp)
1545 {
1546         xfs_inode_t     *ip;
1547         vnode_t         *vp;
1548         xfs_mount_t     *mp;
1549         int             error;
1550
1551         vp = BHV_TO_VNODE(bdp);
1552         ip = XFS_BHVTOI(bdp);
1553
1554         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
1555                 return 0;
1556         }
1557
1558         /* If this is a read-only mount, don't do this (would generate I/O) */
1559         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1560                 return 0;
1561
1562 #ifdef HAVE_REFCACHE
1563         /* If we are in the NFS reference cache then don't do this now */
1564         if (ip->i_refcache)
1565                 return 0;
1566 #endif
1567
1568         mp = ip->i_mount;
1569
1570         if (ip->i_d.di_nlink != 0) {
1571                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1572                      ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1573                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1574                     (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) {
1575                         if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1576                                 return (error);
1577                         /* Update linux inode block count after free above */
1578                         LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1579                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1580                 }
1581         }
1582
1583         return 0;
1584 }
1585
1586 /*
1587  * xfs_inactive
1588  *
1589  * This is called when the vnode reference count for the vnode
1590  * goes to zero.  If the file has been unlinked, then it must
1591  * now be truncated.  Also, we clear all of the read-ahead state
1592  * kept for the inode here since the file is now closed.
1593  */
1594 STATIC int
1595 xfs_inactive(
1596         bhv_desc_t      *bdp,
1597         cred_t          *credp)
1598 {
1599         xfs_inode_t     *ip;
1600         vnode_t         *vp;
1601         xfs_bmap_free_t free_list;
1602         xfs_fsblock_t   first_block;
1603         int             committed;
1604         xfs_trans_t     *tp;
1605         xfs_mount_t     *mp;
1606         int             error;
1607         int             truncate;
1608
1609         vp = BHV_TO_VNODE(bdp);
1610         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1611
1612         ip = XFS_BHVTOI(bdp);
1613
1614         /*
1615          * If the inode is already free, then there can be nothing
1616          * to clean up here.
1617          */
1618         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1619                 ASSERT(ip->i_df.if_real_bytes == 0);
1620                 ASSERT(ip->i_df.if_broot_bytes == 0);
1621                 return VN_INACTIVE_CACHE;
1622         }
1623
1624         /*
1625          * Only do a truncate if it's a regular file with
1626          * some actual space in it.  It's OK to look at the
1627          * inode's fields without the lock because we're the
1628          * only one with a reference to the inode.
1629          */
1630         truncate = ((ip->i_d.di_nlink == 0) &&
1631             ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) &&
1632             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1633
1634         mp = ip->i_mount;
1635
1636         if (ip->i_d.di_nlink == 0 &&
1637             DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1638                 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1639         }
1640
1641         error = 0;
1642
1643         /* If this is a read-only mount, don't do this (would generate I/O) */
1644         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1645                 goto out;
1646
1647         if (ip->i_d.di_nlink != 0) {
1648                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1649                      ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1650                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1651                     (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) ||
1652                      (ip->i_delayed_blks != 0))) {
1653                         if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1654                                 return (VN_INACTIVE_CACHE);
1655                         /* Update linux inode block count after free above */
1656                         LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1657                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1658                 }
1659                 goto out;
1660         }
1661
1662         ASSERT(ip->i_d.di_nlink == 0);
1663
1664         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1665                 return (VN_INACTIVE_CACHE);
1666
1667         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1668         if (truncate) {
1669                 /*
1670                  * Do the xfs_itruncate_start() call before
1671                  * reserving any log space because itruncate_start
1672                  * will call into the buffer cache and we can't
1673                  * do that within a transaction.
1674                  */
1675                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1676
1677                 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1678
1679                 error = xfs_trans_reserve(tp, 0,
1680                                           XFS_ITRUNCATE_LOG_RES(mp),
1681                                           0, XFS_TRANS_PERM_LOG_RES,
1682                                           XFS_ITRUNCATE_LOG_COUNT);
1683                 if (error) {
1684                         /* Don't call itruncate_cleanup */
1685                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1686                         xfs_trans_cancel(tp, 0);
1687                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1688                         return (VN_INACTIVE_CACHE);
1689                 }
1690
1691                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1692                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1693                 xfs_trans_ihold(tp, ip);
1694
1695                 /*
1696                  * normally, we have to run xfs_itruncate_finish sync.
1697                  * But if filesystem is wsync and we're in the inactive
1698                  * path, then we know that nlink == 0, and that the
1699                  * xaction that made nlink == 0 is permanently committed
1700                  * since xfs_remove runs as a synchronous transaction.
1701                  */
1702                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1703                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1704
1705                 if (error) {
1706                         xfs_trans_cancel(tp,
1707                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1708                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1709                         return (VN_INACTIVE_CACHE);
1710                 }
1711         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1712
1713                 /*
1714                  * If we get an error while cleaning up a
1715                  * symlink we bail out.
1716                  */
1717                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1718                         xfs_inactive_symlink_rmt(ip, &tp) :
1719                         xfs_inactive_symlink_local(ip, &tp);
1720
1721                 if (error) {
1722                         ASSERT(tp == NULL);
1723                         return (VN_INACTIVE_CACHE);
1724                 }
1725
1726                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1727                 xfs_trans_ihold(tp, ip);
1728         } else {
1729                 error = xfs_trans_reserve(tp, 0,
1730                                           XFS_IFREE_LOG_RES(mp),
1731                                           0, XFS_TRANS_PERM_LOG_RES,
1732                                           XFS_INACTIVE_LOG_COUNT);
1733                 if (error) {
1734                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1735                         xfs_trans_cancel(tp, 0);
1736                         return (VN_INACTIVE_CACHE);
1737                 }
1738
1739                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1740                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1741                 xfs_trans_ihold(tp, ip);
1742         }
1743
1744         /*
1745          * If there are attributes associated with the file
1746          * then blow them away now.  The code calls a routine
1747          * that recursively deconstructs the attribute fork.
1748          * We need to just commit the current transaction
1749          * because we can't use it for xfs_attr_inactive().
1750          */
1751         if (ip->i_d.di_anextents > 0) {
1752                 error = xfs_inactive_attrs(ip, &tp);
1753                 /*
1754                  * If we got an error, the transaction is already
1755                  * cancelled, and the inode is unlocked. Just get out.
1756                  */
1757                  if (error)
1758                          return (VN_INACTIVE_CACHE);
1759         } else if (ip->i_afp) {
1760                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1761         }
1762
1763         /*
1764          * Free the inode.
1765          */
1766         XFS_BMAP_INIT(&free_list, &first_block);
1767         error = xfs_ifree(tp, ip, &free_list);
1768         if (error) {
1769                 /*
1770                  * If we fail to free the inode, shut down.  The cancel
1771                  * might do that, we need to make sure.  Otherwise the
1772                  * inode might be lost for a long time or forever.
1773                  */
1774                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1775                         cmn_err(CE_NOTE,
1776                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1777                                 error, mp->m_fsname);
1778                         xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
1779                 }
1780                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1781         } else {
1782                 /*
1783                  * Credit the quota account(s). The inode is gone.
1784                  */
1785                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1786
1787                 /*
1788                  * Just ignore errors at this point.  There is
1789                  * nothing we can do except to try to keep going.
1790                  */
1791                 (void) xfs_bmap_finish(&tp,  &free_list, first_block,
1792                                        &committed);
1793                 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1794         }
1795         /*
1796          * Release the dquots held by inode, if any.
1797          */
1798         XFS_QM_DQDETACH(mp, ip);
1799
1800         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1801
1802  out:
1803         return VN_INACTIVE_CACHE;
1804 }
1805
1806
1807 /*
1808  * xfs_lookup
1809  */
1810 STATIC int
1811 xfs_lookup(
1812         bhv_desc_t              *dir_bdp,
1813         vname_t                 *dentry,
1814         vnode_t                 **vpp,
1815         int                     flags,
1816         vnode_t                 *rdir,
1817         cred_t                  *credp)
1818 {
1819         xfs_inode_t             *dp, *ip;
1820         xfs_ino_t               e_inum;
1821         int                     error;
1822         uint                    lock_mode;
1823         vnode_t                 *dir_vp;
1824
1825         dir_vp = BHV_TO_VNODE(dir_bdp);
1826         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1827
1828         dp = XFS_BHVTOI(dir_bdp);
1829
1830         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1831                 return XFS_ERROR(EIO);
1832
1833         lock_mode = xfs_ilock_map_shared(dp);
1834         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1835         if (!error) {
1836                 *vpp = XFS_ITOV(ip);
1837                 ITRACE(ip);
1838         }
1839         xfs_iunlock_map_shared(dp, lock_mode);
1840         return error;
1841 }
1842
1843
1844 /*
1845  * xfs_create (create a new file).
1846  */
1847 STATIC int
1848 xfs_create(
1849         bhv_desc_t              *dir_bdp,
1850         vname_t                 *dentry,
1851         vattr_t                 *vap,
1852         vnode_t                 **vpp,
1853         cred_t                  *credp)
1854 {
1855         char                    *name = VNAME(dentry);
1856         vnode_t                 *dir_vp;
1857         xfs_inode_t             *dp, *ip;
1858         vnode_t                 *vp=NULL;
1859         xfs_trans_t             *tp;
1860         xfs_mount_t             *mp;
1861         xfs_dev_t               rdev;
1862         int                     error;
1863         xfs_bmap_free_t         free_list;
1864         xfs_fsblock_t           first_block;
1865         boolean_t               dp_joined_to_trans;
1866         int                     dm_event_sent = 0;
1867         uint                    cancel_flags;
1868         int                     committed;
1869         xfs_prid_t              prid;
1870         struct xfs_dquot        *udqp, *gdqp;
1871         uint                    resblks;
1872         int                     dm_di_mode;
1873         int                     namelen;
1874
1875         ASSERT(!*vpp);
1876         dir_vp = BHV_TO_VNODE(dir_bdp);
1877         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1878
1879         dp = XFS_BHVTOI(dir_bdp);
1880         mp = dp->i_mount;
1881
1882         dm_di_mode = vap->va_mode;
1883         namelen = VNAMELEN(dentry);
1884
1885         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1886                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1887                                 dir_vp, DM_RIGHT_NULL, NULL,
1888                                 DM_RIGHT_NULL, name, NULL,
1889                                 dm_di_mode, 0, 0);
1890
1891                 if (error)
1892                         return error;
1893                 dm_event_sent = 1;
1894         }
1895
1896         if (XFS_FORCED_SHUTDOWN(mp))
1897                 return XFS_ERROR(EIO);
1898
1899         /* Return through std_return after this point. */
1900
1901         udqp = gdqp = NULL;
1902         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1903                 prid = dp->i_d.di_projid;
1904         else if (vap->va_mask & XFS_AT_PROJID)
1905                 prid = (xfs_prid_t)vap->va_projid;
1906         else
1907                 prid = (xfs_prid_t)dfltprid;
1908
1909         /*
1910          * Make sure that we have allocated dquot(s) on disk.
1911          */
1912         error = XFS_QM_DQVOPALLOC(mp, dp,
1913                         current_fsuid(credp), current_fsgid(credp), prid,
1914                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1915         if (error)
1916                 goto std_return;
1917
1918         ip = NULL;
1919         dp_joined_to_trans = B_FALSE;
1920
1921         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1922         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1923         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1924         /*
1925          * Initially assume that the file does not exist and
1926          * reserve the resources for that case.  If that is not
1927          * the case we'll drop the one we have and get a more
1928          * appropriate transaction later.
1929          */
1930         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1931                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1932         if (error == ENOSPC) {
1933                 resblks = 0;
1934                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1935                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1936         }
1937         if (error) {
1938                 cancel_flags = 0;
1939                 dp = NULL;
1940                 goto error_return;
1941         }
1942
1943         xfs_ilock(dp, XFS_ILOCK_EXCL);
1944
1945         XFS_BMAP_INIT(&free_list, &first_block);
1946
1947         ASSERT(ip == NULL);
1948
1949         /*
1950          * Reserve disk quota and the inode.
1951          */
1952         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1953         if (error)
1954                 goto error_return;
1955
1956         if (resblks == 0 &&
1957             (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
1958                 goto error_return;
1959         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1960         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
1961                         rdev, credp, prid, resblks > 0,
1962                         &ip, &committed);
1963         if (error) {
1964                 if (error == ENOSPC)
1965                         goto error_return;
1966                 goto abort_return;
1967         }
1968         ITRACE(ip);
1969
1970         /*
1971          * At this point, we've gotten a newly allocated inode.
1972          * It is locked (and joined to the transaction).
1973          */
1974
1975         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1976
1977         /*
1978          * Now we join the directory inode to the transaction.
1979          * We do not do it earlier because xfs_dir_ialloc
1980          * might commit the previous transaction (and release
1981          * all the locks).
1982          */
1983
1984         VN_HOLD(dir_vp);
1985         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1986         dp_joined_to_trans = B_TRUE;
1987
1988         error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
1989                 &first_block, &free_list,
1990                 resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1991         if (error) {
1992                 ASSERT(error != ENOSPC);
1993                 goto abort_return;
1994         }
1995         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1996         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1997
1998         /*
1999          * If this is a synchronous mount, make sure that the
2000          * create transaction goes to disk before returning to
2001          * the user.
2002          */
2003         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2004                 xfs_trans_set_sync(tp);
2005         }
2006
2007         dp->i_gen++;
2008
2009         /*
2010          * Attach the dquot(s) to the inodes and modify them incore.
2011          * These ids of the inode couldn't have changed since the new
2012          * inode has been locked ever since it was created.
2013          */
2014         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2015
2016         /*
2017          * xfs_trans_commit normally decrements the vnode ref count
2018          * when it unlocks the inode. Since we want to return the
2019          * vnode to the caller, we bump the vnode ref count now.
2020          */
2021         IHOLD(ip);
2022         vp = XFS_ITOV(ip);
2023
2024         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2025         if (error) {
2026                 xfs_bmap_cancel(&free_list);
2027                 goto abort_rele;
2028         }
2029
2030         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2031         if (error) {
2032                 IRELE(ip);
2033                 tp = NULL;
2034                 goto error_return;
2035         }
2036
2037         XFS_QM_DQRELE(mp, udqp);
2038         XFS_QM_DQRELE(mp, gdqp);
2039
2040         /*
2041          * Propogate the fact that the vnode changed after the
2042          * xfs_inode locks have been released.
2043          */
2044         VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2045
2046         *vpp = vp;
2047
2048         /* Fallthrough to std_return with error = 0  */
2049
2050 std_return:
2051         if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2052                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2053                                                         DM_EVENT_POSTCREATE)) {
2054                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2055                         dir_vp, DM_RIGHT_NULL,
2056                         *vpp ? vp:NULL,
2057                         DM_RIGHT_NULL, name, NULL,
2058                         dm_di_mode, error, 0);
2059         }
2060         return error;
2061
2062  abort_return:
2063         cancel_flags |= XFS_TRANS_ABORT;
2064         /* FALLTHROUGH */
2065  error_return:
2066
2067         if (tp != NULL)
2068                 xfs_trans_cancel(tp, cancel_flags);
2069
2070         if (!dp_joined_to_trans && (dp != NULL))
2071                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2072         XFS_QM_DQRELE(mp, udqp);
2073         XFS_QM_DQRELE(mp, gdqp);
2074
2075         goto std_return;
2076
2077  abort_rele:
2078         /*
2079          * Wait until after the current transaction is aborted to
2080          * release the inode.  This prevents recursive transactions
2081          * and deadlocks from xfs_inactive.
2082          */
2083         cancel_flags |= XFS_TRANS_ABORT;
2084         xfs_trans_cancel(tp, cancel_flags);
2085         IRELE(ip);
2086
2087         XFS_QM_DQRELE(mp, udqp);
2088         XFS_QM_DQRELE(mp, gdqp);
2089
2090         goto std_return;
2091 }
2092
#ifdef DEBUG
/*
 * Debug-only statistics for xfs_lock_dir_and_entry(), showing whether
 * (and how often) its deadlock avoidance paths are being taken.
 */

/* Calls to xfs_lock_dir_and_entry(). */
int xfs_rm_locks;

/* Times a one tick delay was inserted between retries. */
int xfs_rm_lock_delays;

/* Failed non-blocking lock attempts on the entry inode. */
int xfs_rm_attempts;
#endif
2103
2104 /*
2105  * The following routine will lock the inodes associated with the
2106  * directory and the named entry in the directory. The locks are
2107  * acquired in increasing inode number.
2108  *
2109  * If the entry is "..", then only the directory is locked. The
2110  * vnode ref count will still include that from the .. entry in
2111  * this case.
2112  *
2113  * There is a deadlock we need to worry about. If the locked directory is
2114  * in the AIL, it might be blocking up the log. The next inode we lock
2115  * could be already locked by another thread waiting for log space (e.g
2116  * a permanent log reservation with a long running transaction (see
2117  * xfs_itruncate_finish)). To solve this, we must check if the directory
2118  * is in the ail and use lock_nowait. If we can't lock, we need to
2119  * drop the inode lock on the directory and try again. xfs_iunlock will
2120  * potentially push the tail if we were holding up the log.
2121  */
2122 STATIC int
2123 xfs_lock_dir_and_entry(
2124         xfs_inode_t     *dp,
2125         vname_t         *dentry,
2126         xfs_inode_t     *ip)    /* inode of entry 'name' */
2127 {
2128         int             attempts;
2129         xfs_ino_t       e_inum;
2130         xfs_inode_t     *ips[2];
2131         xfs_log_item_t  *lp;
2132
2133 #ifdef DEBUG
2134         xfs_rm_locks++;
2135 #endif
2136         attempts = 0;
2137
2138 again:
2139         xfs_ilock(dp, XFS_ILOCK_EXCL);
2140
2141         e_inum = ip->i_ino;
2142
2143         ITRACE(ip);
2144
2145         /*
2146          * We want to lock in increasing inum. Since we've already
2147          * acquired the lock on the directory, we may need to release
2148          * if if the inum of the entry turns out to be less.
2149          */
2150         if (e_inum > dp->i_ino) {
2151                 /*
2152                  * We are already in the right order, so just
2153                  * lock on the inode of the entry.
2154                  * We need to use nowait if dp is in the AIL.
2155                  */
2156
2157                 lp = (xfs_log_item_t *)dp->i_itemp;
2158                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2159                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2160                                 attempts++;
2161 #ifdef DEBUG
2162                                 xfs_rm_attempts++;
2163 #endif
2164
2165                                 /*
2166                                  * Unlock dp and try again.
2167                                  * xfs_iunlock will try to push the tail
2168                                  * if the inode is in the AIL.
2169                                  */
2170
2171                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2172
2173                                 if ((attempts % 5) == 0) {
2174                                         delay(1); /* Don't just spin the CPU */
2175 #ifdef DEBUG
2176                                         xfs_rm_lock_delays++;
2177 #endif
2178                                 }
2179                                 goto again;
2180                         }
2181                 } else {
2182                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2183                 }
2184         } else if (e_inum < dp->i_ino) {
2185                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2186
2187                 ips[0] = ip;
2188                 ips[1] = dp;
2189                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2190         }
2191         /* else  e_inum == dp->i_ino */
2192         /*     This can happen if we're asked to lock /x/..
2193          *     the entry is "..", which is also the parent directory.
2194          */
2195
2196         return 0;
2197 }
2198
#ifdef DEBUG
/* Debug-only statistics maintained by xfs_lock_inodes(). */

/* Calls that got all their locks without a single retry. */
int xfs_locked_n;

/* Calls that needed fewer than 5 retries. */
int xfs_small_retries;

/* Calls that needed between 5 and 99 retries. */
int xfs_middle_retries;

/* Calls that needed 100 retries or more. */
int xfs_lots_retries;

/* Times a one tick delay was inserted between retries. */
int xfs_lock_delays;
#endif
2206
2207 /*
2208  * The following routine will lock n inodes in exclusive mode.
2209  * We assume the caller calls us with the inodes in i_ino order.
2210  *
2211  * We need to detect deadlock where an inode that we lock
2212  * is in the AIL and we start waiting for another inode that is locked
2213  * by a thread in a long running transaction (such as truncate). This can
2214  * result in deadlock since the long running trans might need to wait
2215  * for the inode we just locked in order to push the tail and free space
2216  * in the log.
2217  */
2218 void
2219 xfs_lock_inodes(
2220         xfs_inode_t     **ips,
2221         int             inodes,
2222         int             first_locked,
2223         uint            lock_mode)
2224 {
2225         int             attempts = 0, i, j, try_lock;
2226         xfs_log_item_t  *lp;
2227
2228         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2229
2230         if (first_locked) {
2231                 try_lock = 1;
2232                 i = 1;
2233         } else {
2234                 try_lock = 0;
2235                 i = 0;
2236         }
2237
2238 again:
2239         for (; i < inodes; i++) {
2240                 ASSERT(ips[i]);
2241
2242                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2243                         continue;
2244
2245                 /*
2246                  * If try_lock is not set yet, make sure all locked inodes
2247                  * are not in the AIL.
2248                  * If any are, set try_lock to be used later.
2249                  */
2250
2251                 if (!try_lock) {
2252                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2253                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2254                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2255                                         try_lock++;
2256                                 }
2257                         }
2258                 }
2259
2260                 /*
2261                  * If any of the previous locks we have locked is in the AIL,
2262                  * we must TRY to get the second and subsequent locks. If
2263                  * we can't get any, we must release all we have
2264                  * and try again.
2265                  */
2266
2267                 if (try_lock) {
2268                         /* try_lock must be 0 if i is 0. */
2269                         /*
2270                          * try_lock means we have an inode locked
2271                          * that is in the AIL.
2272                          */
2273                         ASSERT(i != 0);
2274                         if (!xfs_ilock_nowait(ips[i], lock_mode)) {
2275                                 attempts++;
2276
2277                                 /*
2278                                  * Unlock all previous guys and try again.
2279                                  * xfs_iunlock will try to push the tail
2280                                  * if the inode is in the AIL.
2281                                  */
2282
2283                                 for(j = i - 1; j >= 0; j--) {
2284
2285                                         /*
2286                                          * Check to see if we've already
2287                                          * unlocked this one.
2288                                          * Not the first one going back,
2289                                          * and the inode ptr is the same.
2290                                          */
2291                                         if ((j != (i - 1)) && ips[j] ==
2292                                                                 ips[j+1])
2293                                                 continue;
2294
2295                                         xfs_iunlock(ips[j], lock_mode);
2296                                 }
2297
2298                                 if ((attempts % 5) == 0) {
2299                                         delay(1); /* Don't just spin the CPU */
2300 #ifdef DEBUG
2301                                         xfs_lock_delays++;
2302 #endif
2303                                 }
2304                                 i = 0;
2305                                 try_lock = 0;
2306                                 goto again;
2307                         }
2308                 } else {
2309                         xfs_ilock(ips[i], lock_mode);
2310                 }
2311         }
2312
2313 #ifdef DEBUG
2314         if (attempts) {
2315                 if (attempts < 5) xfs_small_retries++;
2316                 else if (attempts < 100) xfs_middle_retries++;
2317                 else xfs_lots_retries++;
2318         } else {
2319                 xfs_locked_n++;
2320         }
2321 #endif
2322 }
2323
/*
 * REMOVE_DEBUG_TRACE(x)
 *
 * In DEBUG builds, store x (callers pass __LINE__) in the global
 * remove_which_error_return so the most recently taken error exit can be
 * identified.  In non-DEBUG builds the macro does nothing.
 *
 * Both variants expand to a single do { } while (0) statement so that
 * "REMOVE_DEBUG_TRACE(x);" is safe in any statement context, including an
 * unbraced if/else.
 */
#ifdef DEBUG
#define REMOVE_DEBUG_TRACE(x)	\
	do { remove_which_error_return = (x); } while (0)
int remove_which_error_return = 0;
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)	do { } while (0)
#endif /* ! DEBUG */
2330
2331
2332 /*
2333  * xfs_remove
2334  *
2335  */
2336 STATIC int
2337 xfs_remove(
2338         bhv_desc_t              *dir_bdp,
2339         vname_t                 *dentry,
2340         cred_t                  *credp)
2341 {
2342         vnode_t                 *dir_vp;
2343         char                    *name = VNAME(dentry);
2344         xfs_inode_t             *dp, *ip;
2345         xfs_trans_t             *tp = NULL;
2346         xfs_mount_t             *mp;
2347         int                     error = 0;
2348         xfs_bmap_free_t         free_list;
2349         xfs_fsblock_t           first_block;
2350         int                     cancel_flags;
2351         int                     committed;
2352         int                     dm_di_mode = 0;
2353         int                     link_zero;
2354         uint                    resblks;
2355         int                     namelen;
2356
2357         dir_vp = BHV_TO_VNODE(dir_bdp);
2358         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2359
2360         dp = XFS_BHVTOI(dir_bdp);
2361         mp = dp->i_mount;
2362
2363         if (XFS_FORCED_SHUTDOWN(mp))
2364                 return XFS_ERROR(EIO);
2365
2366         namelen = VNAMELEN(dentry);
2367
2368         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2369                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2370                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2371                                         name, NULL, 0, 0, 0);
2372                 if (error)
2373                         return error;
2374         }
2375
2376         /* From this point on, return through std_return */
2377         ip = NULL;
2378
2379         /*
2380          * We need to get a reference to ip before we get our log
2381          * reservation. The reason for this is that we cannot call
2382          * xfs_iget for an inode for which we do not have a reference
2383          * once we've acquired a log reservation. This is because the
2384          * inode we are trying to get might be in xfs_inactive going
2385          * for a log reservation. Since we'll have to wait for the
2386          * inactive code to complete before returning from xfs_iget,
2387          * we need to make sure that we don't have log space reserved
2388          * when we call xfs_iget.  Instead we get an unlocked referece
2389          * to the inode before getting our log reservation.
2390          */
2391         error = xfs_get_dir_entry(dentry, &ip);
2392         if (error) {
2393                 REMOVE_DEBUG_TRACE(__LINE__);
2394                 goto std_return;
2395         }
2396
2397         dm_di_mode = ip->i_d.di_mode;
2398
2399         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2400
2401         ITRACE(ip);
2402
2403         error = XFS_QM_DQATTACH(mp, dp, 0);
2404         if (!error && dp != ip)
2405                 error = XFS_QM_DQATTACH(mp, ip, 0);
2406         if (error) {
2407                 REMOVE_DEBUG_TRACE(__LINE__);
2408                 IRELE(ip);
2409                 goto std_return;
2410         }
2411
2412         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2413         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2414         /*
2415          * We try to get the real space reservation first,
2416          * allowing for directory btree deletion(s) implying
2417          * possible bmap insert(s).  If we can't get the space
2418          * reservation then we use 0 instead, and avoid the bmap
2419          * btree insert(s) in the directory code by, if the bmap
2420          * insert tries to happen, instead trimming the LAST
2421          * block from the directory.
2422          */
2423         resblks = XFS_REMOVE_SPACE_RES(mp);
2424         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2425                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2426         if (error == ENOSPC) {
2427                 resblks = 0;
2428                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2429                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2430         }
2431         if (error) {
2432                 ASSERT(error != ENOSPC);
2433                 REMOVE_DEBUG_TRACE(__LINE__);
2434                 xfs_trans_cancel(tp, 0);
2435                 IRELE(ip);
2436                 return error;
2437         }
2438
2439         error = xfs_lock_dir_and_entry(dp, dentry, ip);
2440         if (error) {
2441                 REMOVE_DEBUG_TRACE(__LINE__);
2442                 xfs_trans_cancel(tp, cancel_flags);
2443                 IRELE(ip);
2444                 goto std_return;
2445         }
2446
2447         /*
2448          * At this point, we've gotten both the directory and the entry
2449          * inodes locked.
2450          */
2451         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2452         if (dp != ip) {
2453                 /*
2454                  * Increment vnode ref count only in this case since
2455                  * there's an extra vnode reference in the case where
2456                  * dp == ip.
2457                  */
2458                 IHOLD(dp);
2459                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2460         }
2461
2462         /*
2463          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2464          */
2465         XFS_BMAP_INIT(&free_list, &first_block);
2466         error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
2467                 &first_block, &free_list, 0);
2468         if (error) {
2469                 ASSERT(error != ENOENT);
2470                 REMOVE_DEBUG_TRACE(__LINE__);
2471                 goto error1;
2472         }
2473         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2474
2475         dp->i_gen++;
2476         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2477
2478         error = xfs_droplink(tp, ip);
2479         if (error) {
2480                 REMOVE_DEBUG_TRACE(__LINE__);
2481                 goto error1;
2482         }
2483
2484         /* Determine if this is the last link while
2485          * we are in the transaction.
2486          */
2487         link_zero = (ip)->i_d.di_nlink==0;
2488
2489         /*
2490          * Take an extra ref on the inode so that it doesn't
2491          * go to xfs_inactive() from within the commit.
2492          */
2493         IHOLD(ip);
2494
2495         /*
2496          * If this is a synchronous mount, make sure that the
2497          * remove transaction goes to disk before returning to
2498          * the user.
2499          */
2500         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2501                 xfs_trans_set_sync(tp);
2502         }
2503
2504         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2505         if (error) {
2506                 REMOVE_DEBUG_TRACE(__LINE__);
2507                 goto error_rele;
2508         }
2509
2510         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2511         if (error) {
2512                 IRELE(ip);
2513                 goto std_return;
2514         }
2515
2516         /*
2517          * Before we drop our extra reference to the inode, purge it
2518          * from the refcache if it is there.  By waiting until afterwards
2519          * to do the IRELE, we ensure that we won't go inactive in the
2520          * xfs_refcache_purge_ip routine (although that would be OK).
2521          */
2522         xfs_refcache_purge_ip(ip);
2523
2524         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2525
2526         /*
2527          * Let interposed file systems know about removed links.
2528          */
2529         VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
2530
2531         IRELE(ip);
2532
2533 /*      Fall through to std_return with error = 0 */
2534  std_return:
2535         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2536                                                 DM_EVENT_POSTREMOVE)) {
2537                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2538                                 dir_vp, DM_RIGHT_NULL,
2539                                 NULL, DM_RIGHT_NULL,
2540                                 name, NULL, dm_di_mode, error, 0);
2541         }
2542         return error;
2543
2544  error1:
2545         xfs_bmap_cancel(&free_list);
2546         cancel_flags |= XFS_TRANS_ABORT;
2547         xfs_trans_cancel(tp, cancel_flags);
2548         goto std_return;
2549
2550  error_rele:
2551         /*
2552          * In this case make sure to not release the inode until after
2553          * the current transaction is aborted.  Releasing it beforehand
2554          * can cause us to go to xfs_inactive and start a recursive
2555          * transaction which can easily deadlock with the current one.
2556          */
2557         xfs_bmap_cancel(&free_list);
2558         cancel_flags |= XFS_TRANS_ABORT;
2559         xfs_trans_cancel(tp, cancel_flags);
2560
2561         /*
2562          * Before we drop our extra reference to the inode, purge it
2563          * from the refcache if it is there.  By waiting until afterwards
2564          * to do the IRELE, we ensure that we won't go inactive in the
2565          * xfs_refcache_purge_ip routine (although that would be OK).
2566          */
2567         xfs_refcache_purge_ip(ip);
2568
2569         IRELE(ip);
2570
2571         goto std_return;
2572 }
2573
2574
2575 /*
2576  * xfs_link
2577  *
2578  */
2579 STATIC int
2580 xfs_link(
2581         bhv_desc_t              *target_dir_bdp,
2582         vnode_t                 *src_vp,
2583         vname_t                 *dentry,
2584         cred_t                  *credp)
2585 {
2586         xfs_inode_t             *tdp, *sip;
2587         xfs_trans_t             *tp;
2588         xfs_mount_t             *mp;
2589         xfs_inode_t             *ips[2];
2590         int                     error;
2591         xfs_bmap_free_t         free_list;
2592         xfs_fsblock_t           first_block;
2593         int                     cancel_flags;
2594         int                     committed;
2595         vnode_t                 *target_dir_vp;
2596         bhv_desc_t              *src_bdp;
2597         int                     resblks;
2598         char                    *target_name = VNAME(dentry);
2599         int                     target_namelen;
2600
2601         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2602         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2603         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2604
2605         target_namelen = VNAMELEN(dentry);
2606         if (VN_ISDIR(src_vp))
2607                 return XFS_ERROR(EPERM);
2608
2609         src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
2610         sip = XFS_BHVTOI(src_bdp);
2611         tdp = XFS_BHVTOI(target_dir_bdp);
2612         mp = tdp->i_mount;
2613         if (XFS_FORCED_SHUTDOWN(mp))
2614                 return XFS_ERROR(EIO);
2615
2616         if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2617                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2618                                         target_dir_vp, DM_RIGHT_NULL,
2619                                         src_vp, DM_RIGHT_NULL,
2620                                         target_name, NULL, 0, 0, 0);
2621                 if (error)
2622                         return error;
2623         }
2624
2625         /* Return through std_return after this point. */
2626
2627         error = XFS_QM_DQATTACH(mp, sip, 0);
2628         if (!error && sip != tdp)
2629                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2630         if (error)
2631                 goto std_return;
2632
2633         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2634         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2635         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2636         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2637                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2638         if (error == ENOSPC) {
2639                 resblks = 0;
2640                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2641                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2642         }
2643         if (error) {
2644                 cancel_flags = 0;
2645                 goto error_return;
2646         }
2647
2648         if (sip->i_ino < tdp->i_ino) {
2649                 ips[0] = sip;
2650                 ips[1] = tdp;
2651         } else {
2652                 ips[0] = tdp;
2653                 ips[1] = sip;
2654         }
2655
2656         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2657
2658         /*
2659          * Increment vnode ref counts since xfs_trans_commit &
2660          * xfs_trans_cancel will both unlock the inodes and
2661          * decrement the associated ref counts.
2662          */
2663         VN_HOLD(src_vp);
2664         VN_HOLD(target_dir_vp);
2665         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2666         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2667
2668         /*
2669          * If the source has too many links, we can't make any more to it.
2670          */
2671         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2672                 error = XFS_ERROR(EMLINK);
2673                 goto error_return;
2674         }
2675
2676         /*
2677          * If we are using project inheritance, we only allow hard link
2678          * creation in our tree when the project IDs are the same; else
2679          * the tree quota mechanism could be circumvented.
2680          */
2681         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2682                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2683                 error = XFS_ERROR(EPERM);
2684                 goto error_return;
2685         }
2686
2687         if (resblks == 0 &&
2688             (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
2689                         target_namelen)))
2690                 goto error_return;
2691
2692         XFS_BMAP_INIT(&free_list, &first_block);
2693
2694         error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
2695                                    sip->i_ino, &first_block, &free_list,
2696                                    resblks);
2697         if (error)
2698                 goto abort_return;
2699         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2700         tdp->i_gen++;
2701         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2702
2703         error = xfs_bumplink(tp, sip);
2704         if (error) {
2705                 goto abort_return;
2706         }
2707
2708         /*
2709          * If this is a synchronous mount, make sure that the
2710          * link transaction goes to disk before returning to
2711          * the user.
2712          */
2713         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2714                 xfs_trans_set_sync(tp);
2715         }
2716
2717         error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
2718         if (error) {
2719                 xfs_bmap_cancel(&free_list);
2720                 goto abort_return;
2721         }
2722
2723         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2724         if (error) {
2725                 goto std_return;
2726         }
2727
2728         /* Fall through to std_return with error = 0. */
2729 std_return:
2730         if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2731                                                 DM_EVENT_POSTLINK)) {
2732                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2733                                 target_dir_vp, DM_RIGHT_NULL,
2734                                 src_vp, DM_RIGHT_NULL,
2735                                 target_name, NULL, 0, error, 0);
2736         }
2737         return error;
2738
2739  abort_return:
2740         cancel_flags |= XFS_TRANS_ABORT;
2741         /* FALLTHROUGH */
2742  error_return:
2743         xfs_trans_cancel(tp, cancel_flags);
2744
2745         goto std_return;
2746 }
2747 /*
2748  * xfs_mkdir
2749  *
2750  */
2751 STATIC int
2752 xfs_mkdir(
2753         bhv_desc_t              *dir_bdp,
2754         vname_t                 *dentry,
2755         vattr_t                 *vap,
2756         vnode_t                 **vpp,
2757         cred_t                  *credp)
2758 {
2759         char                    *dir_name = VNAME(dentry);
2760         xfs_inode_t             *dp;
2761         xfs_inode_t             *cdp;   /* inode of created dir */
2762         vnode_t                 *cvp;   /* vnode of created dir */
2763         xfs_trans_t             *tp;
2764         xfs_mount_t             *mp;
2765         int                     cancel_flags;
2766         int                     error;
2767         int                     committed;
2768         xfs_bmap_free_t         free_list;
2769         xfs_fsblock_t           first_block;
2770         vnode_t                 *dir_vp;
2771         boolean_t               dp_joined_to_trans;
2772         boolean_t               created = B_FALSE;
2773         int                     dm_event_sent = 0;
2774         xfs_prid_t              prid;
2775         struct xfs_dquot        *udqp, *gdqp;
2776         uint                    resblks;
2777         int                     dm_di_mode;
2778         int                     dir_namelen;
2779
2780         dir_vp = BHV_TO_VNODE(dir_bdp);
2781         dp = XFS_BHVTOI(dir_bdp);
2782         mp = dp->i_mount;
2783
2784         if (XFS_FORCED_SHUTDOWN(mp))
2785                 return XFS_ERROR(EIO);
2786
2787         dir_namelen = VNAMELEN(dentry);
2788
2789         tp = NULL;
2790         dp_joined_to_trans = B_FALSE;
2791         dm_di_mode = vap->va_mode;
2792
2793         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2794                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2795                                         dir_vp, DM_RIGHT_NULL, NULL,
2796                                         DM_RIGHT_NULL, dir_name, NULL,
2797                                         dm_di_mode, 0, 0);
2798                 if (error)
2799                         return error;
2800                 dm_event_sent = 1;
2801         }
2802
2803         /* Return through std_return after this point. */
2804
2805         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2806
2807         mp = dp->i_mount;
2808         udqp = gdqp = NULL;
2809         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2810                 prid = dp->i_d.di_projid;
2811         else if (vap->va_mask & XFS_AT_PROJID)
2812                 prid = (xfs_prid_t)vap->va_projid;
2813         else
2814                 prid = (xfs_prid_t)dfltprid;
2815
2816         /*
2817          * Make sure that we have allocated dquot(s) on disk.
2818          */
2819         error = XFS_QM_DQVOPALLOC(mp, dp,
2820                         current_fsuid(credp), current_fsgid(credp), prid,
2821                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2822         if (error)
2823                 goto std_return;
2824
2825         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2826         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2827         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2828         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2829                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2830         if (error == ENOSPC) {
2831                 resblks = 0;
2832                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2833                                           XFS_TRANS_PERM_LOG_RES,
2834                                           XFS_MKDIR_LOG_COUNT);
2835         }
2836         if (error) {
2837                 cancel_flags = 0;
2838                 dp = NULL;
2839                 goto error_return;
2840         }
2841
2842         xfs_ilock(dp, XFS_ILOCK_EXCL);
2843
2844         /*
2845          * Check for directory link count overflow.
2846          */
2847         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2848                 error = XFS_ERROR(EMLINK);
2849                 goto error_return;
2850         }
2851
2852         /*
2853          * Reserve disk quota and the inode.
2854          */
2855         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2856         if (error)
2857                 goto error_return;
2858
2859         if (resblks == 0 &&
2860             (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
2861                 goto error_return;
2862         /*
2863          * create the directory inode.
2864          */
2865         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2866                         0, credp, prid, resblks > 0,
2867                 &cdp, NULL);
2868         if (error) {
2869                 if (error == ENOSPC)
2870                         goto error_return;
2871                 goto abort_return;
2872         }
2873         ITRACE(cdp);
2874
2875         /*
2876          * Now we add the directory inode to the transaction.
2877          * We waited until now since xfs_dir_ialloc might start
2878          * a new transaction.  Had we joined the transaction
2879          * earlier, the locks might have gotten released.
2880          */
2881         VN_HOLD(dir_vp);
2882         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2883         dp_joined_to_trans = B_TRUE;
2884
2885         XFS_BMAP_INIT(&free_list, &first_block);
2886
2887         error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
2888                         cdp->i_ino, &first_block, &free_list,
2889                         resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2890         if (error) {
2891                 ASSERT(error != ENOSPC);
2892                 goto error1;
2893         }
2894         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2895
2896         /*
2897          * Bump the in memory version number of the parent directory
2898          * so that other processes accessing it will recognize that
2899          * the directory has changed.
2900          */
2901         dp->i_gen++;
2902
2903         error = XFS_DIR_INIT(mp, tp, cdp, dp);
2904         if (error) {
2905                 goto error2;
2906         }
2907
2908         cdp->i_gen = 1;
2909         error = xfs_bumplink(tp, dp);
2910         if (error) {
2911                 goto error2;
2912         }
2913
2914         cvp = XFS_ITOV(cdp);
2915
2916         created = B_TRUE;
2917
2918         *vpp = cvp;
2919         IHOLD(cdp);
2920
2921         /*
2922          * Attach the dquots to the new inode and modify the icount incore.
2923          */
2924         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2925
2926         /*
2927          * If this is a synchronous mount, make sure that the
2928          * mkdir transaction goes to disk before returning to
2929          * the user.
2930          */
2931         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2932                 xfs_trans_set_sync(tp);
2933         }
2934
2935         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2936         if (error) {
2937                 IRELE(cdp);
2938                 goto error2;
2939         }
2940
2941         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2942         XFS_QM_DQRELE(mp, udqp);
2943         XFS_QM_DQRELE(mp, gdqp);
2944         if (error) {
2945                 IRELE(cdp);
2946         }
2947
2948         /* Fall through to std_return with error = 0 or errno from
2949          * xfs_trans_commit. */
2950
2951 std_return:
2952         if ( (created || (error != 0 && dm_event_sent != 0)) &&
2953                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2954                                                 DM_EVENT_POSTCREATE)) {
2955                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2956                                         dir_vp, DM_RIGHT_NULL,
2957                                         created ? XFS_ITOV(cdp):NULL,
2958                                         DM_RIGHT_NULL,
2959                                         dir_name, NULL,
2960                                         dm_di_mode, error, 0);
2961         }
2962         return error;
2963
2964  error2:
2965  error1:
2966         xfs_bmap_cancel(&free_list);
2967  abort_return:
2968         cancel_flags |= XFS_TRANS_ABORT;
2969  error_return:
2970         xfs_trans_cancel(tp, cancel_flags);
2971         XFS_QM_DQRELE(mp, udqp);
2972         XFS_QM_DQRELE(mp, gdqp);
2973
2974         if (!dp_joined_to_trans && (dp != NULL)) {
2975                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2976         }
2977
2978         goto std_return;
2979 }
2980
2981
2982 /*
2983  * xfs_rmdir
2984  *
2985  */
2986 STATIC int
2987 xfs_rmdir(
2988         bhv_desc_t              *dir_bdp,
2989         vname_t                 *dentry,
2990         cred_t                  *credp)
2991 {
2992         char                    *name = VNAME(dentry);
2993         xfs_inode_t             *dp;
2994         xfs_inode_t             *cdp;   /* child directory */
2995         xfs_trans_t             *tp;
2996         xfs_mount_t             *mp;
2997         int                     error;
2998         xfs_bmap_free_t         free_list;
2999         xfs_fsblock_t           first_block;
3000         int                     cancel_flags;
3001         int                     committed;
3002         vnode_t                 *dir_vp;
3003         int                     dm_di_mode = 0;
3004         int                     last_cdp_link;
3005         int                     namelen;
3006         uint                    resblks;
3007
3008         dir_vp = BHV_TO_VNODE(dir_bdp);
3009         dp = XFS_BHVTOI(dir_bdp);
3010         mp = dp->i_mount;
3011
3012         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3013
3014         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3015                 return XFS_ERROR(EIO);
3016         namelen = VNAMELEN(dentry);
3017
3018         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3019                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3020                                         dir_vp, DM_RIGHT_NULL,
3021                                         NULL, DM_RIGHT_NULL,
3022                                         name, NULL, 0, 0, 0);
3023                 if (error)
3024                         return XFS_ERROR(error);
3025         }
3026
3027         /* Return through std_return after this point. */
3028
3029         cdp = NULL;
3030
3031         /*
3032          * We need to get a reference to cdp before we get our log
3033          * reservation.  The reason for this is that we cannot call
3034          * xfs_iget for an inode for which we do not have a reference
3035          * once we've acquired a log reservation.  This is because the
3036          * inode we are trying to get might be in xfs_inactive going
3037          * for a log reservation.  Since we'll have to wait for the
3038          * inactive code to complete before returning from xfs_iget,
3039          * we need to make sure that we don't have log space reserved
3040          * when we call xfs_iget.  Instead we get an unlocked referece
3041          * to the inode before getting our log reservation.
3042          */
3043         error = xfs_get_dir_entry(dentry, &cdp);
3044         if (error) {
3045                 REMOVE_DEBUG_TRACE(__LINE__);
3046                 goto std_return;
3047         }
3048         mp = dp->i_mount;
3049         dm_di_mode = cdp->i_d.di_mode;
3050
3051         /*
3052          * Get the dquots for the inodes.
3053          */
3054         error = XFS_QM_DQATTACH(mp, dp, 0);
3055         if (!error && dp != cdp)
3056                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3057         if (error) {
3058                 IRELE(cdp);
3059                 REMOVE_DEBUG_TRACE(__LINE__);
3060                 goto std_return;
3061         }
3062
3063         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3064         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3065         /*
3066          * We try to get the real space reservation first,
3067          * allowing for directory btree deletion(s) implying
3068          * possible bmap insert(s).  If we can't get the space
3069          * reservation then we use 0 instead, and avoid the bmap
3070          * btree insert(s) in the directory code by, if the bmap
3071          * insert tries to happen, instead trimming the LAST
3072          * block from the directory.
3073          */
3074         resblks = XFS_REMOVE_SPACE_RES(mp);
3075         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3076                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3077         if (error == ENOSPC) {
3078                 resblks = 0;
3079                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3080                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3081         }
3082         if (error) {
3083                 ASSERT(error != ENOSPC);
3084                 cancel_flags = 0;
3085                 IRELE(cdp);
3086                 goto error_return;
3087         }
3088         XFS_BMAP_INIT(&free_list, &first_block);
3089
3090         /*
3091          * Now lock the child directory inode and the parent directory
3092          * inode in the proper order.  This will take care of validating
3093          * that the directory entry for the child directory inode has
3094          * not changed while we were obtaining a log reservation.
3095          */
3096         error = xfs_lock_dir_and_entry(dp, dentry, cdp);
3097         if (error) {
3098                 xfs_trans_cancel(tp, cancel_flags);
3099                 IRELE(cdp);
3100                 goto std_return;
3101         }
3102
3103         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3104         if (dp != cdp) {
3105                 /*
3106                  * Only increment the parent directory vnode count if
3107                  * we didn't bump it in looking up cdp.  The only time
3108                  * we don't bump it is when we're looking up ".".
3109                  */
3110                 VN_HOLD(dir_vp);
3111         }
3112
3113         ITRACE(cdp);
3114         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3115
3116         ASSERT(cdp->i_d.di_nlink >= 2);
3117         if (cdp->i_d.di_nlink != 2) {
3118                 error = XFS_ERROR(ENOTEMPTY);
3119                 goto error_return;
3120         }
3121         if (!XFS_DIR_ISEMPTY(mp, cdp)) {
3122                 error = XFS_ERROR(ENOTEMPTY);
3123                 goto error_return;
3124         }
3125
3126         error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
3127                 &first_block, &free_list, resblks);
3128         if (error) {
3129                 goto error1;
3130         }
3131
3132         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3133
3134         /*
3135          * Bump the in memory generation count on the parent
3136          * directory so that other can know that it has changed.
3137          */
3138         dp->i_gen++;
3139
3140         /*
3141          * Drop the link from cdp's "..".
3142          */
3143         error = xfs_droplink(tp, dp);
3144         if (error) {
3145                 goto error1;
3146         }
3147
3148         /*
3149          * Drop the link from dp to cdp.
3150          */
3151         error = xfs_droplink(tp, cdp);
3152         if (error) {
3153                 goto error1;
3154         }
3155
3156         /*
3157          * Drop the "." link from cdp to self.
3158          */
3159         error = xfs_droplink(tp, cdp);
3160         if (error) {
3161                 goto error1;
3162         }
3163
3164         /* Determine these before committing transaction */
3165         last_cdp_link = (cdp)->i_d.di_nlink==0;
3166
3167         /*
3168          * Take an extra ref on the child vnode so that it
3169          * does not go to xfs_inactive() from within the commit.
3170          */
3171         IHOLD(cdp);
3172
3173         /*
3174          * If this is a synchronous mount, make sure that the
3175          * rmdir transaction goes to disk before returning to
3176          * the user.
3177          */
3178         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3179                 xfs_trans_set_sync(tp);
3180         }
3181
3182         error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
3183         if (error) {
3184                 xfs_bmap_cancel(&free_list);
3185                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3186                                  XFS_TRANS_ABORT));
3187                 IRELE(cdp);
3188                 goto std_return;
3189         }
3190
3191         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3192         if (error) {
3193                 IRELE(cdp);
3194                 goto std_return;
3195         }
3196
3197
3198         /*
3199          * Let interposed file systems know about removed links.
3200          */
3201         VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3202
3203         IRELE(cdp);
3204
3205         /* Fall through to std_return with error = 0 or the errno
3206          * from xfs_trans_commit. */
3207 std_return:
3208         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3209                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3210                                         dir_vp, DM_RIGHT_NULL,
3211                                         NULL, DM_RIGHT_NULL,
3212                                         name, NULL, dm_di_mode,
3213                                         error, 0);
3214         }
3215         return error;
3216
3217  error1:
3218         xfs_bmap_cancel(&free_list);
3219         cancel_flags |= XFS_TRANS_ABORT;
3220  error_return:
3221         xfs_trans_cancel(tp, cancel_flags);
3222         goto std_return;
3223 }
3224
3225
3226 /*
3227  * xfs_readdir
3228  *
3229  * Read dp's entries starting at uiop->uio_offset and translate them into
3230  * bufsize bytes worth of struct dirents starting at bufbase.
3231  */
3232 STATIC int
3233 xfs_readdir(
3234         bhv_desc_t      *dir_bdp,
3235         uio_t           *uiop,
3236         cred_t          *credp,
3237         int             *eofp)
3238 {
3239         xfs_inode_t     *dp;
3240         xfs_trans_t     *tp = NULL;
3241         int             error = 0;
3242         uint            lock_mode;
3243         xfs_off_t       start_offset;
3244
3245         vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3246                                                (inst_t *)__return_address);
3247         dp = XFS_BHVTOI(dir_bdp);
3248
3249         if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
3250                 return XFS_ERROR(EIO);
3251         }
3252
3253         lock_mode = xfs_ilock_map_shared(dp);
3254         start_offset = uiop->uio_offset;
3255         error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
3256         if (start_offset != uiop->uio_offset) {
3257                 xfs_ichgtime(dp, XFS_ICHGTIME_ACC);
3258         }
3259         xfs_iunlock_map_shared(dp, lock_mode);
3260         return error;
3261 }
3262
3263
3264 /*
3265  * xfs_symlink
3266  *
3267  */
3268 STATIC int
3269 xfs_symlink(
3270         bhv_desc_t              *dir_bdp,
3271         vname_t                 *dentry,
3272         vattr_t                 *vap,
3273         char                    *target_path,
3274         vnode_t                 **vpp,
3275         cred_t                  *credp)
3276 {
3277         xfs_trans_t             *tp;
3278         xfs_mount_t             *mp;
3279         xfs_inode_t             *dp;
3280         xfs_inode_t             *ip;
3281         int                     error;
3282         int                     pathlen;
3283         xfs_bmap_free_t         free_list;
3284         xfs_fsblock_t           first_block;
3285         boolean_t               dp_joined_to_trans;
3286         vnode_t                 *dir_vp;
3287         uint                    cancel_flags;
3288         int                     committed;
3289         xfs_fileoff_t           first_fsb;
3290         xfs_filblks_t           fs_blocks;
3291         int                     nmaps;
3292         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3293         xfs_daddr_t             d;
3294         char                    *cur_chunk;
3295         int                     byte_cnt;
3296         int                     n;
3297         xfs_buf_t               *bp;
3298         xfs_prid_t              prid;
3299         struct xfs_dquot        *udqp, *gdqp;
3300         uint                    resblks;
3301         char                    *link_name = VNAME(dentry);
3302         int                     link_namelen;
3303
3304         *vpp = NULL;
3305         dir_vp = BHV_TO_VNODE(dir_bdp);
3306         dp = XFS_BHVTOI(dir_bdp);
3307         dp_joined_to_trans = B_FALSE;
3308         error = 0;
3309         ip = NULL;
3310         tp = NULL;
3311
3312         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3313
3314         mp = dp->i_mount;
3315
3316         if (XFS_FORCED_SHUTDOWN(mp))
3317                 return XFS_ERROR(EIO);
3318
3319         link_namelen = VNAMELEN(dentry);
3320
3321         /*
3322          * Check component lengths of the target path name.
3323          */
3324         pathlen = strlen(target_path);
3325         if (pathlen >= MAXPATHLEN)      /* total string too long */
3326                 return XFS_ERROR(ENAMETOOLONG);
3327         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3328                 int len, total;
3329                 char *path;
3330
3331                 for(total = 0, path = target_path; total < pathlen;) {
3332                         /*
3333                          * Skip any slashes.
3334                          */
3335                         while(*path == '/') {
3336                                 total++;
3337                                 path++;
3338                         }
3339
3340                         /*
3341                          * Count up to the next slash or end of path.
3342                          * Error out if the component is bigger than MAXNAMELEN.
3343                          */
3344                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3345                                 if (++len >= MAXNAMELEN) {
3346                                         error = ENAMETOOLONG;
3347                                         return error;
3348                                 }
3349                         }
3350                 }
3351         }
3352
3353         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3354                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3355                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3356                                         link_name, target_path, 0, 0, 0);
3357                 if (error)
3358                         return error;
3359         }
3360
3361         /* Return through std_return after this point. */
3362
3363         udqp = gdqp = NULL;
3364         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3365                 prid = dp->i_d.di_projid;
3366         else if (vap->va_mask & XFS_AT_PROJID)
3367                 prid = (xfs_prid_t)vap->va_projid;
3368         else
3369                 prid = (xfs_prid_t)dfltprid;
3370
3371         /*
3372          * Make sure that we have allocated dquot(s) on disk.
3373          */
3374         error = XFS_QM_DQVOPALLOC(mp, dp,
3375                         current_fsuid(credp), current_fsgid(credp), prid,
3376                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3377         if (error)
3378                 goto std_return;
3379
3380         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3381         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3382         /*
3383          * The symlink will fit into the inode data fork?
3384          * There can't be any attributes so we get the whole variable part.
3385          */
3386         if (pathlen <= XFS_LITINO(mp))
3387                 fs_blocks = 0;
3388         else
3389                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3390         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3391         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3392                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3393         if (error == ENOSPC && fs_blocks == 0) {
3394                 resblks = 0;
3395                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3396                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3397         }
3398         if (error) {
3399                 cancel_flags = 0;
3400                 dp = NULL;
3401                 goto error_return;
3402         }
3403
3404         xfs_ilock(dp, XFS_ILOCK_EXCL);
3405
3406         /*
3407          * Check whether the directory allows new symlinks or not.
3408          */
3409         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3410                 error = XFS_ERROR(EPERM);
3411                 goto error_return;
3412         }
3413
3414         /*
3415          * Reserve disk quota : blocks and inode.
3416          */
3417         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3418         if (error)
3419                 goto error_return;
3420
3421         /*
3422          * Check for ability to enter directory entry, if no space reserved.
3423          */
3424         if (resblks == 0 &&
3425             (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
3426                 goto error_return;
3427         /*
3428          * Initialize the bmap freelist prior to calling either
3429          * bmapi or the directory create code.
3430          */
3431         XFS_BMAP_INIT(&free_list, &first_block);
3432
3433         /*
3434          * Allocate an inode for the symlink.
3435          */
3436         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3437                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3438         if (error) {
3439                 if (error == ENOSPC)
3440                         goto error_return;
3441                 goto error1;
3442         }
3443         ITRACE(ip);
3444
3445         VN_HOLD(dir_vp);
3446         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3447         dp_joined_to_trans = B_TRUE;
3448
3449         /*
3450          * Also attach the dquot(s) to it, if applicable.
3451          */
3452         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3453
3454         if (resblks)
3455                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3456         /*
3457          * If the symlink will fit into the inode, write it inline.
3458          */
3459         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3460                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3461                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3462                 ip->i_d.di_size = pathlen;
3463
3464                 /*
3465                  * The inode was initially created in extent format.
3466                  */
3467                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3468                 ip->i_df.if_flags |= XFS_IFINLINE;
3469
3470                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3471                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3472
3473         } else {
3474                 first_fsb = 0;
3475                 nmaps = SYMLINK_MAPS;
3476
3477                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3478                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3479                                   &first_block, resblks, mval, &nmaps,
3480                                   &free_list);
3481                 if (error) {
3482                         goto error1;
3483                 }
3484
3485                 if (resblks)
3486                         resblks -= fs_blocks;
3487                 ip->i_d.di_size = pathlen;
3488                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3489
3490                 cur_chunk = target_path;
3491                 for (n = 0; n < nmaps; n++) {
3492                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3493                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3494                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3495                                                BTOBB(byte_cnt), 0);
3496                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3497                         if (pathlen < byte_cnt) {
3498                                 byte_cnt = pathlen;
3499                         }
3500                         pathlen -= byte_cnt;
3501
3502                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3503                         cur_chunk += byte_cnt;
3504
3505                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3506                 }
3507         }
3508
3509         /*
3510          * Create the directory entry for the symlink.
3511          */
3512         error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
3513                         ip->i_ino, &first_block, &free_list, resblks);
3514         if (error) {
3515                 goto error1;
3516         }
3517         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3518         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3519
3520         /*
3521          * Bump the in memory version number of the parent directory
3522          * so that other processes accessing it will recognize that
3523          * the directory has changed.
3524          */
3525         dp->i_gen++;
3526
3527         /*
3528          * If this is a synchronous mount, make sure that the
3529          * symlink transaction goes to disk before returning to
3530          * the user.
3531          */
3532         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3533                 xfs_trans_set_sync(tp);
3534         }
3535
3536         /*
3537          * xfs_trans_commit normally decrements the vnode ref count
3538          * when it unlocks the inode. Since we want to return the
3539          * vnode to the caller, we bump the vnode ref count now.
3540          */
3541         IHOLD(ip);
3542
3543         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
3544         if (error) {
3545                 goto error2;
3546         }
3547         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3548         XFS_QM_DQRELE(mp, udqp);
3549         XFS_QM_DQRELE(mp, gdqp);
3550
3551         /* Fall through to std_return with error = 0 or errno from
3552          * xfs_trans_commit     */
3553 std_return:
3554         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3555                              DM_EVENT_POSTSYMLINK)) {
3556                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3557                                         dir_vp, DM_RIGHT_NULL,
3558                                         error ? NULL : XFS_ITOV(ip),
3559                                         DM_RIGHT_NULL, link_name, target_path,
3560                                         0, error, 0);
3561         }
3562
3563         if (!error) {
3564                 vnode_t *vp;
3565
3566                 ASSERT(ip);
3567                 vp = XFS_ITOV(ip);
3568                 *vpp = vp;
3569         }
3570         return error;
3571
3572  error2:
3573         IRELE(ip);
3574  error1:
3575         xfs_bmap_cancel(&free_list);
3576         cancel_flags |= XFS_TRANS_ABORT;
3577  error_return:
3578         xfs_trans_cancel(tp, cancel_flags);
3579         XFS_QM_DQRELE(mp, udqp);
3580         XFS_QM_DQRELE(mp, gdqp);
3581
3582         if (!dp_joined_to_trans && (dp != NULL)) {
3583                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3584         }
3585
3586         goto std_return;
3587 }
3588
3589
3590 /*
3591  * xfs_fid2
3592  *
3593  * A fid routine that takes a pointer to a previously allocated
3594  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3595  */
3596 STATIC int
3597 xfs_fid2(
3598         bhv_desc_t      *bdp,
3599         fid_t           *fidp)
3600 {
3601         xfs_inode_t     *ip;
3602         xfs_fid2_t      *xfid;
3603
3604         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3605                                        (inst_t *)__return_address);
3606         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3607
3608         xfid = (xfs_fid2_t *)fidp;
3609         ip = XFS_BHVTOI(bdp);
3610         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3611         xfid->fid_pad = 0;
3612         /*
3613          * use memcpy because the inode is a long long and there's no
3614          * assurance that xfid->fid_ino is properly aligned.
3615          */
3616         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3617         xfid->fid_gen = ip->i_d.di_gen;
3618
3619         return 0;
3620 }
3621
3622
3623 /*
3624  * xfs_rwlock
3625  */
3626 int
3627 xfs_rwlock(
3628         bhv_desc_t      *bdp,
3629         vrwlock_t       locktype)
3630 {
3631         xfs_inode_t     *ip;
3632         vnode_t         *vp;
3633
3634         vp = BHV_TO_VNODE(bdp);
3635         if (VN_ISDIR(vp))
3636                 return 1;
3637         ip = XFS_BHVTOI(bdp);
3638         if (locktype == VRWLOCK_WRITE) {
3639                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3640         } else if (locktype == VRWLOCK_TRY_READ) {
3641                 return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED));
3642         } else if (locktype == VRWLOCK_TRY_WRITE) {
3643                 return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL));
3644         } else {
3645                 ASSERT((locktype == VRWLOCK_READ) ||
3646                        (locktype == VRWLOCK_WRITE_DIRECT));
3647                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3648         }
3649
3650         return 1;
3651 }
3652
3653
3654 /*
3655  * xfs_rwunlock
3656  */
3657 void
3658 xfs_rwunlock(
3659         bhv_desc_t      *bdp,
3660         vrwlock_t       locktype)
3661 {
3662         xfs_inode_t     *ip;
3663         vnode_t         *vp;
3664
3665         vp = BHV_TO_VNODE(bdp);
3666         if (VN_ISDIR(vp))
3667                 return;
3668         ip = XFS_BHVTOI(bdp);
3669         if (locktype == VRWLOCK_WRITE) {
3670                 /*
3671                  * In the write case, we may have added a new entry to
3672                  * the reference cache.  This might store a pointer to
3673                  * an inode to be released in this inode.  If it is there,
3674                  * clear the pointer and release the inode after unlocking
3675                  * this one.
3676                  */
3677                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3678         } else {
3679                 ASSERT((locktype == VRWLOCK_READ) ||
3680                        (locktype == VRWLOCK_WRITE_DIRECT));
3681                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3682         }
3683         return;
3684 }
3685
3686 STATIC int
3687 xfs_inode_flush(
3688         bhv_desc_t      *bdp,
3689         int             flags)
3690 {
3691         xfs_inode_t     *ip;
3692         xfs_mount_t     *mp;
3693         xfs_inode_log_item_t *iip;
3694         int             error = 0;
3695
3696         ip = XFS_BHVTOI(bdp);
3697         mp = ip->i_mount;
3698         iip = ip->i_itemp;
3699
3700         if (XFS_FORCED_SHUTDOWN(mp))
3701                 return XFS_ERROR(EIO);
3702
3703         /*
3704          * Bypass inodes which have already been cleaned by
3705          * the inode flush clustering code inside xfs_iflush
3706          */
3707         if ((ip->i_update_core == 0) &&
3708             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3709                 return 0;
3710
3711         if (flags & FLUSH_LOG) {
3712                 if (iip && iip->ili_last_lsn) {
3713                         xlog_t          *log = mp->m_log;
3714                         xfs_lsn_t       sync_lsn;
3715                         int             s, log_flags = XFS_LOG_FORCE;
3716
3717                         s = GRANT_LOCK(log);
3718                         sync_lsn = log->l_last_sync_lsn;
3719                         GRANT_UNLOCK(log, s);
3720
3721                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3722                                 return 0;
3723
3724                         if (flags & FLUSH_SYNC)
3725                                 log_flags |= XFS_LOG_SYNC;
3726                         return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3727                 }
3728         }
3729
3730         /*
3731          * We make this non-blocking if the inode is contended,
3732          * return EAGAIN to indicate to the caller that they
3733          * did not succeed. This prevents the flush path from
3734          * blocking on inodes inside another operation right
3735          * now, they get caught later by xfs_sync.
3736          */
3737         if (flags & FLUSH_INODE) {
3738                 int     flush_flags;
3739
3740                 if (xfs_ipincount(ip))
3741                         return EAGAIN;
3742
3743                 if (flags & FLUSH_SYNC) {
3744                         xfs_ilock(ip, XFS_ILOCK_SHARED);
3745                         xfs_iflock(ip);
3746                 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3747                         if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3748                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3749                                 return EAGAIN;
3750                         }
3751                 } else {
3752                         return EAGAIN;
3753                 }
3754
3755                 if (flags & FLUSH_SYNC)
3756                         flush_flags = XFS_IFLUSH_SYNC;
3757                 else
3758                         flush_flags = XFS_IFLUSH_ASYNC;
3759
3760                 error = xfs_iflush(ip, flush_flags);
3761                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3762         }
3763
3764         return error;
3765 }
3766
3767
3768 int
3769 xfs_set_dmattrs (
3770         bhv_desc_t      *bdp,
3771         u_int           evmask,
3772         u_int16_t       state,
3773         cred_t          *credp)
3774 {
3775         xfs_inode_t     *ip;
3776         xfs_trans_t     *tp;
3777         xfs_mount_t     *mp;
3778         int             error;
3779
3780         if (!capable(CAP_SYS_ADMIN))
3781                 return XFS_ERROR(EPERM);
3782
3783         ip = XFS_BHVTOI(bdp);
3784         mp = ip->i_mount;
3785
3786         if (XFS_FORCED_SHUTDOWN(mp))
3787                 return XFS_ERROR(EIO);
3788
3789         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3790         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3791         if (error) {
3792                 xfs_trans_cancel(tp, 0);
3793                 return error;
3794         }
3795         xfs_ilock(ip, XFS_ILOCK_EXCL);
3796         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3797
3798         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3799         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3800
3801         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3802         IHOLD(ip);
3803         error = xfs_trans_commit(tp, 0, NULL);
3804
3805         return error;
3806 }
3807
3808
3809 /*
3810  * xfs_reclaim
3811  */
3812 STATIC int
3813 xfs_reclaim(
3814         bhv_desc_t      *bdp)
3815 {
3816         xfs_inode_t     *ip;
3817         vnode_t         *vp;
3818
3819         vp = BHV_TO_VNODE(bdp);
3820         ip = XFS_BHVTOI(bdp);
3821
3822         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3823
3824         ASSERT(!VN_MAPPED(vp));
3825
3826         /* bad inode, get out here ASAP */
3827         if (VN_BAD(vp)) {
3828                 xfs_ireclaim(ip);
3829                 return 0;
3830         }
3831
3832         vn_iowait(vp);
3833
3834         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3835         ASSERT(VN_CACHED(vp) == 0);
3836
3837         /* If we have nothing to flush with this inode then complete the
3838          * teardown now, otherwise break the link between the xfs inode
3839          * and the linux inode and clean up the xfs inode later. This
3840          * avoids flushing the inode to disk during the delete operation
3841          * itself.
3842          */
3843         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3844                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3845                 xfs_iflock(ip);
3846                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3847         } else {
3848                 xfs_mount_t     *mp = ip->i_mount;
3849
3850                 /* Protect sync from us */
3851                 XFS_MOUNT_ILOCK(mp);
3852                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3853                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3854                 ip->i_flags |= XFS_IRECLAIMABLE;
3855                 XFS_MOUNT_IUNLOCK(mp);
3856         }
3857         return 0;
3858 }
3859
3860 int
3861 xfs_finish_reclaim(
3862         xfs_inode_t     *ip,
3863         int             locked,
3864         int             sync_mode)
3865 {
3866         xfs_ihash_t     *ih = ip->i_hash;
3867         vnode_t         *vp = XFS_ITOV_NULL(ip);
3868         int             error;
3869
3870         if (vp && VN_BAD(vp))
3871                 goto reclaim;
3872
3873         /* The hash lock here protects a thread in xfs_iget_core from
3874          * racing with us on linking the inode back with a vnode.
3875          * Once we have the XFS_IRECLAIM flag set it will not touch
3876          * us.
3877          */
3878         write_lock(&ih->ih_lock);
3879         if ((ip->i_flags & XFS_IRECLAIM) ||
3880             (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) {
3881                 write_unlock(&ih->ih_lock);
3882                 if (locked) {
3883                         xfs_ifunlock(ip);
3884                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3885                 }
3886                 return(1);
3887         }
3888         ip->i_flags |= XFS_IRECLAIM;
3889         write_unlock(&ih->ih_lock);
3890
3891         /*
3892          * If the inode is still dirty, then flush it out.  If the inode
3893          * is not in the AIL, then it will be OK to flush it delwri as
3894          * long as xfs_iflush() does not keep any references to the inode.
3895          * We leave that decision up to xfs_iflush() since it has the
3896          * knowledge of whether it's OK to simply do a delwri flush of
3897          * the inode or whether we need to wait until the inode is
3898          * pulled from the AIL.
3899          * We get the flush lock regardless, though, just to make sure
3900          * we don't free it while it is being flushed.
3901          */
3902         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3903                 if (!locked) {
3904                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3905                         xfs_iflock(ip);
3906                 }
3907
3908                 if (ip->i_update_core ||
3909                     ((ip->i_itemp != NULL) &&
3910                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3911                         error = xfs_iflush(ip, sync_mode);
3912                         /*
3913                          * If we hit an error, typically because of filesystem
3914                          * shutdown, we don't need to let vn_reclaim to know
3915                          * because we're gonna reclaim the inode anyway.
3916                          */
3917                         if (error) {
3918                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3919                                 goto reclaim;
3920                         }
3921                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3922                 }
3923
3924                 ASSERT(ip->i_update_core == 0);
3925                 ASSERT(ip->i_itemp == NULL ||
3926                        ip->i_itemp->ili_format.ilf_fields == 0);
3927                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3928         } else if (locked) {
3929                 /*
3930                  * We are not interested in doing an iflush if we're
3931                  * in the process of shutting down the filesystem forcibly.
3932                  * So, just reclaim the inode.
3933                  */
3934                 xfs_ifunlock(ip);
3935                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3936         }
3937
3938  reclaim:
3939         xfs_ireclaim(ip);
3940         return 0;
3941 }
3942
3943 int
3944 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3945 {
3946         int             purged;
3947         xfs_inode_t     *ip, *n;
3948         int             done = 0;
3949
3950         while (!done) {
3951                 purged = 0;
3952                 XFS_MOUNT_ILOCK(mp);
3953                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3954                         if (noblock) {
3955                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3956                                         continue;
3957                                 if (xfs_ipincount(ip) ||
3958                                     !xfs_iflock_nowait(ip)) {
3959                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3960                                         continue;
3961                                 }
3962                         }
3963                         XFS_MOUNT_IUNLOCK(mp);
3964                         if (xfs_finish_reclaim(ip, noblock,
3965                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3966                                 delay(1);
3967                         purged = 1;
3968                         break;
3969                 }
3970
3971                 done = !purged;
3972         }
3973
3974         XFS_MOUNT_IUNLOCK(mp);
3975         return 0;
3976 }
3977
3978 /*
3979  * xfs_alloc_file_space()
3980  *      This routine allocates disk space for the given file.
3981  *
3982  *      If alloc_type == 0, this request is for an ALLOCSP type
3983  *      request which will change the file size.  In this case, no
3984  *      DMAPI event will be generated by the call.  A TRUNCATE event
3985  *      will be generated later by xfs_setattr.
3986  *
3987  *      If alloc_type != 0, this request is for a RESVSP type
3988  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3989  *      lower block boundary byte address is less than the file's
3990  *      length.
3991  *
3992  * RETURNS:
3993  *       0 on success
3994  *      errno on error
3995  *
3996  */
3997 STATIC int
3998 xfs_alloc_file_space(
3999         xfs_inode_t             *ip,
4000         xfs_off_t               offset,
4001         xfs_off_t               len,
4002         int                     alloc_type,
4003         int                     attr_flags)
4004 {
4005         xfs_filblks_t           allocated_fsb;
4006         xfs_filblks_t           allocatesize_fsb;
4007         int                     committed;
4008         xfs_off_t               count;
4009         xfs_filblks_t           datablocks;
4010         int                     error;
4011         xfs_fsblock_t           firstfsb;
4012         xfs_bmap_free_t         free_list;
4013         xfs_bmbt_irec_t         *imapp;
4014         xfs_bmbt_irec_t         imaps[1];
4015         xfs_mount_t             *mp;
4016         int                     numrtextents;
4017         int                     reccount;
4018         uint                    resblks;
4019         int                     rt;
4020         int                     rtextsize;
4021         xfs_fileoff_t           startoffset_fsb;
4022         xfs_trans_t             *tp;
4023         int                     xfs_bmapi_flags;
4024
4025         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4026         mp = ip->i_mount;
4027
4028         if (XFS_FORCED_SHUTDOWN(mp))
4029                 return XFS_ERROR(EIO);
4030
4031         /*
4032          * determine if this is a realtime file
4033          */
4034         if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) {
4035                 if (ip->i_d.di_extsize)
4036                         rtextsize = ip->i_d.di_extsize;
4037                 else
4038                         rtextsize = mp->m_sb.sb_rextsize;
4039         } else
4040                 rtextsize = 0;
4041
4042         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4043                 return error;
4044
4045         if (len <= 0)
4046                 return XFS_ERROR(EINVAL);
4047
4048         count = len;
4049         error = 0;
4050         imapp = &imaps[0];
4051         reccount = 1;
4052         xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4053         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4054         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4055
4056         /*      Generate a DMAPI event if needed.       */
4057         if (alloc_type != 0 && offset < ip->i_d.di_size &&
4058                         (attr_flags&ATTR_DMI) == 0  &&
4059                         DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4060                 xfs_off_t           end_dmi_offset;
4061
4062                 end_dmi_offset = offset+len;
4063                 if (end_dmi_offset > ip->i_d.di_size)
4064                         end_dmi_offset = ip->i_d.di_size;
4065                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4066                         offset, end_dmi_offset - offset,
4067                         0, NULL);
4068                 if (error)
4069                         return(error);
4070         }
4071
4072         /*
4073          * allocate file space until done or until there is an error
4074          */
4075 retry:
4076         while (allocatesize_fsb && !error) {
4077                 /*
4078                  * determine if reserving space on
4079                  * the data or realtime partition.
4080                  */
4081                 if (rt) {
4082                         xfs_fileoff_t s, e;
4083
4084                         s = startoffset_fsb;
4085                         do_div(s, rtextsize);
4086                         s *= rtextsize;
4087                         e = roundup_64(startoffset_fsb + allocatesize_fsb,
4088                                 rtextsize);
4089                         numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize;
4090                         datablocks = 0;
4091                 } else {
4092                         datablocks = allocatesize_fsb;
4093                         numrtextents = 0;
4094                 }
4095
4096                 /*
4097                  * allocate and setup the transaction
4098                  */
4099                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4100                 resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
4101                 error = xfs_trans_reserve(tp,
4102                                           resblks,
4103                                           XFS_WRITE_LOG_RES(mp),
4104                                           numrtextents,
4105                                           XFS_TRANS_PERM_LOG_RES,
4106                                           XFS_WRITE_LOG_COUNT);
4107
4108                 /*
4109                  * check for running out of space
4110                  */
4111                 if (error) {
4112                         /*
4113                          * Free the transaction structure.
4114                          */
4115                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4116                         xfs_trans_cancel(tp, 0);
4117                         break;
4118                 }
4119                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4120                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4121                                 ip->i_udquot, ip->i_gdquot, resblks, 0, 0);
4122                 if (error)
4123                         goto error1;
4124
4125                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4126                 xfs_trans_ihold(tp, ip);
4127
4128                 /*
4129                  * issue the bmapi() call to allocate the blocks
4130                  */
4131                 XFS_BMAP_INIT(&free_list, &firstfsb);
4132                 error = xfs_bmapi(tp, ip, startoffset_fsb,
4133                                   allocatesize_fsb, xfs_bmapi_flags,
4134                                   &firstfsb, 0, imapp, &reccount,
4135                                   &free_list);
4136                 if (error) {
4137                         goto error0;
4138                 }
4139
4140                 /*
4141                  * complete the transaction
4142                  */
4143                 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4144                 if (error) {
4145                         goto error0;
4146                 }
4147
4148                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4149                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4150                 if (error) {
4151                         break;
4152                 }
4153
4154                 allocated_fsb = imapp->br_blockcount;
4155
4156                 if (reccount == 0) {
4157                         error = XFS_ERROR(ENOSPC);
4158                         break;
4159                 }
4160
4161                 startoffset_fsb += allocated_fsb;
4162                 allocatesize_fsb -= allocated_fsb;
4163         }
4164 dmapi_enospc_check:
4165         if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4166             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4167
4168                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4169                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4170                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4171                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4172                 if (error == 0)
4173                         goto retry;     /* Maybe DMAPI app. has made space */
4174                 /* else fall through with error from XFS_SEND_DATA */
4175         }
4176
4177         return error;
4178
4179  error0:
4180         xfs_bmap_cancel(&free_list);
4181  error1:
4182         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4183         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4184         goto dmapi_enospc_check;
4185 }
4186
4187 /*
4188  * Zero file bytes between startoff and endoff inclusive.
4189  * The iolock is held exclusive and no blocks are buffered.
4190  */
4191 STATIC int
4192 xfs_zero_remaining_bytes(
4193         xfs_inode_t             *ip,
4194         xfs_off_t               startoff,
4195         xfs_off_t               endoff)
4196 {
4197         xfs_bmbt_irec_t         imap;
4198         xfs_fileoff_t           offset_fsb;
4199         xfs_off_t               lastoffset;
4200         xfs_off_t               offset;
4201         xfs_buf_t               *bp;
4202         xfs_mount_t             *mp = ip->i_mount;
4203         int                     nimap;
4204         int                     error = 0;
4205
4206         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4207                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4208                                 mp->m_rtdev_targp : mp->m_ddev_targp);
4209
4210         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4211                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4212                 nimap = 1;
4213                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
4214                         &nimap, NULL);
4215                 if (error || nimap < 1)
4216                         break;
4217                 ASSERT(imap.br_blockcount >= 1);
4218                 ASSERT(imap.br_startoff == offset_fsb);
4219                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4220                 if (lastoffset > endoff)
4221                         lastoffset = endoff;
4222                 if (imap.br_startblock == HOLESTARTBLOCK)
4223                         continue;
4224                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4225                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4226                         continue;
4227                 XFS_BUF_UNDONE(bp);
4228                 XFS_BUF_UNWRITE(bp);
4229                 XFS_BUF_READ(bp);
4230                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4231                 xfsbdstrat(mp, bp);
4232                 if ((error = xfs_iowait(bp))) {
4233                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4234                                           mp, bp, XFS_BUF_ADDR(bp));
4235                         break;
4236                 }
4237                 memset(XFS_BUF_PTR(bp) +
4238                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4239                       0, lastoffset - offset + 1);
4240                 XFS_BUF_UNDONE(bp);
4241                 XFS_BUF_UNREAD(bp);
4242                 XFS_BUF_WRITE(bp);
4243                 xfsbdstrat(mp, bp);
4244                 if ((error = xfs_iowait(bp))) {
4245                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4246                                           mp, bp, XFS_BUF_ADDR(bp));
4247                         break;
4248                 }
4249         }
4250         xfs_buf_free(bp);
4251         return error;
4252 }
4253
4254 /*
4255  * xfs_free_file_space()
4256  *      This routine frees disk space for the given file.
4257  *
4258  *      This routine is only called by xfs_change_file_space
4259  *      for an UNRESVSP type call.
4260  *
4261  * RETURNS:
4262  *       0 on success
4263  *      errno on error
4264  *
4265  */
4266 STATIC int
4267 xfs_free_file_space(
4268         xfs_inode_t             *ip,
4269         xfs_off_t               offset,
4270         xfs_off_t               len,
4271         int                     attr_flags)
4272 {
4273         vnode_t                 *vp;
4274         int                     committed;
4275         int                     done;
4276         xfs_off_t               end_dmi_offset;
4277         xfs_fileoff_t           endoffset_fsb;
4278         int                     error;
4279         xfs_fsblock_t           firstfsb;
4280         xfs_bmap_free_t         free_list;
4281         xfs_off_t               ilen;
4282         xfs_bmbt_irec_t         imap;
4283         xfs_off_t               ioffset;
4284         xfs_extlen_t            mod=0;
4285         xfs_mount_t             *mp;
4286         int                     nimap;
4287         uint                    resblks;
4288         int                     rounding;
4289         int                     rt;
4290         xfs_fileoff_t           startoffset_fsb;
4291         xfs_trans_t             *tp;
4292         int                     need_iolock = 1;
4293
4294         vp = XFS_ITOV(ip);
4295         mp = ip->i_mount;
4296
4297         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4298
4299         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4300                 return error;
4301
4302         error = 0;
4303         if (len <= 0)   /* if nothing being freed */
4304                 return error;
4305         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4306         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4307         end_dmi_offset = offset + len;
4308         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4309
4310         if (offset < ip->i_d.di_size &&
4311             (attr_flags & ATTR_DMI) == 0 &&
4312             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4313                 if (end_dmi_offset > ip->i_d.di_size)
4314                         end_dmi_offset = ip->i_d.di_size;
4315                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4316                                 offset, end_dmi_offset - offset,
4317                                 AT_DELAY_FLAG(attr_flags), NULL);
4318                 if (error)
4319                         return(error);
4320         }
4321
4322         ASSERT(attr_flags & ATTR_NOLOCK ? attr_flags & ATTR_DMI : 1);
4323         if (attr_flags & ATTR_NOLOCK)
4324                 need_iolock = 0;
4325         if (need_iolock)
4326                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4327
4328         rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog),
4329                         (__uint8_t)NBPP);
4330         ilen = len + (offset & (rounding - 1));
4331         ioffset = offset & ~(rounding - 1);
4332         if (ilen & (rounding - 1))
4333                 ilen = (ilen + rounding) & ~(rounding - 1);
4334
4335         if (VN_CACHED(vp) != 0) {
4336                 xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4337                                 ctooff(offtoct(ioffset)), -1);
4338                 VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(ioffset)),
4339                                 -1, FI_REMAPF_LOCKED);
4340         }
4341
4342         /*
4343          * Need to zero the stuff we're not freeing, on disk.
4344          * If its a realtime file & can't use unwritten extents then we
4345          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4346          * will take care of it for us.
4347          */
4348         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4349                 nimap = 1;
4350                 error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
4351                         &imap, &nimap, NULL);
4352                 if (error)
4353                         goto out_unlock_iolock;
4354                 ASSERT(nimap == 0 || nimap == 1);
4355                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4356                         xfs_daddr_t     block;
4357
4358                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4359                         block = imap.br_startblock;
4360                         mod = do_div(block, mp->m_sb.sb_rextsize);
4361                         if (mod)
4362                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4363                 }
4364                 nimap = 1;
4365                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
4366                         &imap, &nimap, NULL);
4367                 if (error)
4368                         goto out_unlock_iolock;
4369                 ASSERT(nimap == 0 || nimap == 1);
4370                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4371                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4372                         mod++;
4373                         if (mod && (mod != mp->m_sb.sb_rextsize))
4374                                 endoffset_fsb -= mod;
4375                 }
4376         }
4377         if ((done = (endoffset_fsb <= startoffset_fsb)))
4378                 /*
4379                  * One contiguous piece to clear
4380                  */
4381                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4382         else {
4383                 /*
4384                  * Some full blocks, possibly two pieces to clear
4385                  */
4386                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4387                         error = xfs_zero_remaining_bytes(ip, offset,
4388                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4389                 if (!error &&
4390                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4391                         error = xfs_zero_remaining_bytes(ip,
4392                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4393                                 offset + len - 1);
4394         }
4395
4396         /*
4397          * free file space until done or until there is an error
4398          */
4399         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4400         while (!error && !done) {
4401
4402                 /*
4403                  * allocate and setup the transaction
4404                  */
4405                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4406                 error = xfs_trans_reserve(tp,
4407                                           resblks,
4408                                           XFS_WRITE_LOG_RES(mp),
4409                                           0,
4410                                           XFS_TRANS_PERM_LOG_RES,
4411                                           XFS_WRITE_LOG_COUNT);
4412
4413                 /*
4414                  * check for running out of space
4415                  */
4416                 if (error) {
4417                         /*
4418                          * Free the transaction structure.
4419                          */
4420                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4421                         xfs_trans_cancel(tp, 0);
4422                         break;
4423                 }
4424                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4425                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4426                                 ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
4427                                 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4428                 if (error)
4429                         goto error1;
4430
4431                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4432                 xfs_trans_ihold(tp, ip);
4433
4434                 /*
4435                  * issue the bunmapi() call to free the blocks
4436                  */
4437                 XFS_BMAP_INIT(&free_list, &firstfsb);
4438                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
4439                                   endoffset_fsb - startoffset_fsb,
4440                                   0, 2, &firstfsb, &free_list, &done);
4441                 if (error) {
4442                         goto error0;
4443                 }
4444
4445                 /*
4446                  * complete the transaction
4447                  */
4448                 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4449                 if (error) {
4450                         goto error0;
4451                 }
4452
4453                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4454                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4455         }
4456
4457  out_unlock_iolock:
4458         if (need_iolock)
4459                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4460         return error;
4461
4462  error0:
4463         xfs_bmap_cancel(&free_list);
4464  error1:
4465         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4466         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4467                     XFS_ILOCK_EXCL);
4468         return error;
4469 }
4470
4471 /*
4472  * xfs_change_file_space()
4473  *      This routine allocates or frees disk space for the given file.
4474  *      The user specified parameters are checked for alignment and size
4475  *      limitations.
4476  *
4477  * RETURNS:
4478  *       0 on success
4479  *      errno on error
4480  *
4481  */
4482 int
4483 xfs_change_file_space(
4484         bhv_desc_t      *bdp,
4485         int             cmd,
4486         xfs_flock64_t   *bf,
4487         xfs_off_t       offset,
4488         cred_t          *credp,
4489         int             attr_flags)
4490 {
4491         int             clrprealloc;
4492         int             error;
4493         xfs_fsize_t     fsize;
4494         xfs_inode_t     *ip;
4495         xfs_mount_t     *mp;
4496         int             setprealloc;
4497         xfs_off_t       startoffset;
4498         xfs_off_t       llen;
4499         xfs_trans_t     *tp;
4500         vattr_t         va;
4501         vnode_t         *vp;
4502
4503         vp = BHV_TO_VNODE(bdp);
4504         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4505
4506         ip = XFS_BHVTOI(bdp);
4507         mp = ip->i_mount;
4508
4509         /*
4510          * must be a regular file and have write permission
4511          */
4512         if (!VN_ISREG(vp))
4513                 return XFS_ERROR(EINVAL);
4514
4515         xfs_ilock(ip, XFS_ILOCK_SHARED);
4516
4517         if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4518                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4519                 return error;
4520         }
4521
4522         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4523
4524         switch (bf->l_whence) {
4525         case 0: /*SEEK_SET*/
4526                 break;
4527         case 1: /*SEEK_CUR*/
4528                 bf->l_start += offset;
4529                 break;
4530         case 2: /*SEEK_END*/
4531                 bf->l_start += ip->i_d.di_size;
4532                 break;
4533         default:
4534                 return XFS_ERROR(EINVAL);
4535         }
4536
4537         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4538
4539         if (   (bf->l_start < 0)
4540             || (bf->l_start > XFS_MAXIOFFSET(mp))
4541             || (bf->l_start + llen < 0)
4542             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4543                 return XFS_ERROR(EINVAL);
4544
4545         bf->l_whence = 0;
4546
4547         startoffset = bf->l_start;
4548         fsize = ip->i_d.di_size;
4549
4550         /*
4551          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4552          * file space.
4553          * These calls do NOT zero the data space allocated to the file,
4554          * nor do they change the file size.
4555          *
4556          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4557          * space.
4558          * These calls cause the new file data to be zeroed and the file
4559          * size to be changed.
4560          */
4561         setprealloc = clrprealloc = 0;
4562
4563         switch (cmd) {
4564         case XFS_IOC_RESVSP:
4565         case XFS_IOC_RESVSP64:
4566                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4567                                                                 1, attr_flags);
4568                 if (error)
4569                         return error;
4570                 setprealloc = 1;
4571                 break;
4572
4573         case XFS_IOC_UNRESVSP:
4574         case XFS_IOC_UNRESVSP64:
4575                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4576                                                                 attr_flags)))
4577                         return error;
4578                 break;
4579
4580         case XFS_IOC_ALLOCSP:
4581         case XFS_IOC_ALLOCSP64:
4582         case XFS_IOC_FREESP:
4583         case XFS_IOC_FREESP64:
4584                 if (startoffset > fsize) {
4585                         error = xfs_alloc_file_space(ip, fsize,
4586                                         startoffset - fsize, 0, attr_flags);
4587                         if (error)
4588                                 break;
4589                 }
4590
4591                 va.va_mask = XFS_AT_SIZE;
4592                 va.va_size = startoffset;
4593
4594                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4595
4596                 if (error)
4597                         return error;
4598
4599                 clrprealloc = 1;
4600                 break;
4601
4602         default:
4603                 ASSERT(0);
4604                 return XFS_ERROR(EINVAL);
4605         }
4606
4607         /*
4608          * update the inode timestamp, mode, and prealloc flag bits
4609          */
4610         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4611
4612         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4613                                       0, 0, 0))) {
4614                 /* ASSERT(0); */
4615                 xfs_trans_cancel(tp, 0);
4616                 return error;
4617         }
4618
4619         xfs_ilock(ip, XFS_ILOCK_EXCL);
4620
4621         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4622         xfs_trans_ihold(tp, ip);
4623
4624         if ((attr_flags & ATTR_DMI) == 0) {
4625                 ip->i_d.di_mode &= ~S_ISUID;
4626
4627                 /*
4628                  * Note that we don't have to worry about mandatory
4629                  * file locking being disabled here because we only
4630                  * clear the S_ISGID bit if the Group execute bit is
4631                  * on, but if it was on then mandatory locking wouldn't
4632                  * have been enabled.
4633                  */
4634                 if (ip->i_d.di_mode & S_IXGRP)
4635                         ip->i_d.di_mode &= ~S_ISGID;
4636
4637                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4638         }
4639         if (setprealloc)
4640                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4641         else if (clrprealloc)
4642                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4643
4644         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4645         xfs_trans_set_sync(tp);
4646
4647         error = xfs_trans_commit(tp, 0, NULL);
4648
4649         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4650
4651         return error;
4652 }
4653
4654 vnodeops_t xfs_vnodeops = {
             /*
              * Vnode operations vector for XFS: one handler per vop_* slot.
              * Most slots point at the xfs_* routines; the page-cache slots
              * (tosspages, flushinval_pages, flush_pages) use fs_* routines,
              * and link_removed / vnode_change are filled with fs_noval cast
              * to the slot's function type.
              */
4655         BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4656         .vop_open               = xfs_open,
4657         .vop_read               = xfs_read,
4658 #ifdef HAVE_SENDFILE
             /* only populated when HAVE_SENDFILE is defined */
4659         .vop_sendfile           = xfs_sendfile,
4660 #endif
4661         .vop_write              = xfs_write,
4662         .vop_ioctl              = xfs_ioctl,
4663         .vop_getattr            = xfs_getattr,
4664         .vop_setattr            = xfs_setattr,
4665         .vop_access             = xfs_access,
4666         .vop_lookup             = xfs_lookup,
4667         .vop_create             = xfs_create,
4668         .vop_remove             = xfs_remove,
4669         .vop_link               = xfs_link,
4670         .vop_rename             = xfs_rename,
4671         .vop_mkdir              = xfs_mkdir,
4672         .vop_rmdir              = xfs_rmdir,
4673         .vop_readdir            = xfs_readdir,
4674         .vop_symlink            = xfs_symlink,
4675         .vop_readlink           = xfs_readlink,
4676         .vop_fsync              = xfs_fsync,
4677         .vop_inactive           = xfs_inactive,
4678         .vop_fid2               = xfs_fid2,
4679         .vop_rwlock             = xfs_rwlock,
4680         .vop_rwunlock           = xfs_rwunlock,
4681         .vop_bmap               = xfs_bmap,
4682         .vop_reclaim            = xfs_reclaim,
4683         .vop_attr_get           = xfs_attr_get,
4684         .vop_attr_set           = xfs_attr_set,
4685         .vop_attr_remove        = xfs_attr_remove,
4686         .vop_attr_list          = xfs_attr_list,
             /* NOTE(review): fs_noval is presumably a do-nothing stub; confirm */
4687         .vop_link_removed       = (vop_link_removed_t)fs_noval,
4688         .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
4689         .vop_tosspages          = fs_tosspages,
4690         .vop_flushinval_pages   = fs_flushinval_pages,
4691         .vop_flush_pages        = fs_flush_pages,
4692         .vop_release            = xfs_release,
4693         .vop_iflush             = xfs_inode_flush,
4694 };