err.no Git - linux-2.6/blob - fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include "xfs.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_types.h"
  22 #include "xfs_bit.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_sb.h"
  27 #include "xfs_ag.h"
  28 #include "xfs_dir2.h"
  29 #include "xfs_dmapi.h"
  30 #include "xfs_mount.h"
  31 #include "xfs_da_btree.h"
  32 #include "xfs_bmap_btree.h"
  33 #include "xfs_alloc_btree.h"
  34 #include "xfs_ialloc_btree.h"
  35 #include "xfs_dir2_sf.h"
  36 #include "xfs_attr_sf.h"
  37 #include "xfs_dinode.h"
  38 #include "xfs_inode.h"
  39 #include "xfs_inode_item.h"
  40 #include "xfs_itable.h"
  41 #include "xfs_btree.h"
  42 #include "xfs_ialloc.h"
  43 #include "xfs_alloc.h"
  44 #include "xfs_bmap.h"
  45 #include "xfs_attr.h"
  46 #include "xfs_rw.h"
  47 #include "xfs_error.h"
  48 #include "xfs_quota.h"
  49 #include "xfs_utils.h"
  50 #include "xfs_rtalloc.h"
  51 #include "xfs_trans_space.h"
  52 #include "xfs_log_priv.h"
  53 #include "xfs_filestream.h"
  54 #include "xfs_vnodeops.h"
  55
  56 int
  57 xfs_open(
  58         xfs_inode_t     *ip)
  59 {
  60         int             mode;
  61
  62         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  63                 return XFS_ERROR(EIO);
  64
  65         /*
  66          * If it's a directory with any blocks, read-ahead block 0
  67          * as we're almost certain to have the next operation be a read there.
  68          */
  69         if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
  70                 mode = xfs_ilock_map_shared(ip);
  71                 if (ip->i_d.di_nextents > 0)
  72                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  73                 xfs_iunlock(ip, mode);
  74         }
  75         return 0;
  76 }
  77
  78 /*
  79  * xfs_getattr
  80  */
  81 int
  82 xfs_getattr(
  83         xfs_inode_t     *ip,
  84         bhv_vattr_t     *vap,
  85         int             flags)
  86 {
  87         bhv_vnode_t     *vp = XFS_ITOV(ip);
  88         xfs_mount_t     *mp = ip->i_mount;
  89
  90         xfs_itrace_entry(ip);
  91
  92         if (XFS_FORCED_SHUTDOWN(mp))
  93                 return XFS_ERROR(EIO);
  94
  95         if (!(flags & ATTR_LAZY))
  96                 xfs_ilock(ip, XFS_ILOCK_SHARED);
  97
  98         vap->va_size = XFS_ISIZE(ip);
  99         if (vap->va_mask == XFS_AT_SIZE)
 100                 goto all_done;
 101
 102         vap->va_nblocks =
 103                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 104         vap->va_nodeid = ip->i_ino;
 105 #if XFS_BIG_INUMS
 106         vap->va_nodeid += mp->m_inoadd;
 107 #endif
 108         vap->va_nlink = ip->i_d.di_nlink;
 109
 110         /*
 111          * Quick exit for non-stat callers
 112          */
 113         if ((vap->va_mask &
 114             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
 115               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
 116                 goto all_done;
 117
 118         /*
 119          * Copy from in-core inode.
 120          */
 121         vap->va_mode = ip->i_d.di_mode;
 122         vap->va_uid = ip->i_d.di_uid;
 123         vap->va_gid = ip->i_d.di_gid;
 124         vap->va_projid = ip->i_d.di_projid;
 125
 126         /*
 127          * Check vnode type block/char vs. everything else.
 128          */
 129         switch (ip->i_d.di_mode & S_IFMT) {
 130         case S_IFBLK:
 131         case S_IFCHR:
 132                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
 133                 vap->va_blocksize = BLKDEV_IOSIZE;
 134                 break;
 135         default:
 136                 vap->va_rdev = 0;
 137
 138                 if (!(XFS_IS_REALTIME_INODE(ip))) {
 139                         vap->va_blocksize = xfs_preferred_iosize(mp);
 140                 } else {
 141
 142                         /*
 143                          * If the file blocks are being allocated from a
 144                          * realtime partition, then return the inode's
 145                          * realtime extent size or the realtime volume's
 146                          * extent size.
 147                          */
 148                         vap->va_blocksize =
 149                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
 150                 }
 151                 break;
 152         }
 153
 154         vn_atime_to_timespec(vp, &vap->va_atime);
 155         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 156         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
 157         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
 158         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
 159
 160         /*
 161          * Exit for stat callers.  See if any of the rest of the fields
 162          * to be filled in are needed.
 163          */
 164         if ((vap->va_mask &
 165              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 166               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 167                 goto all_done;
 168
 169         /*
 170          * Convert di_flags to xflags.
 171          */
 172         vap->va_xflags = xfs_ip2xflags(ip);
 173
 174         /*
 175          * Exit for inode revalidate.  See if any of the rest of
 176          * the fields to be filled in are needed.
 177          */
 178         if ((vap->va_mask &
 179              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 180               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 181                 goto all_done;
 182
 183         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
 184         vap->va_nextents =
 185                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
 186                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
 187                         ip->i_d.di_nextents;
 188         if (ip->i_afp)
 189                 vap->va_anextents =
 190                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
 191                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
 192                                  ip->i_d.di_anextents;
 193         else
 194                 vap->va_anextents = 0;
 195         vap->va_gen = ip->i_d.di_gen;
 196
 197  all_done:
 198         if (!(flags & ATTR_LAZY))
 199                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 200         return 0;
 201 }
 202
 203
 204 /*
 205  * xfs_setattr
 206  */
 207 int
 208 xfs_setattr(
 209         xfs_inode_t             *ip,
 210         bhv_vattr_t             *vap,
 211         int                     flags,
 212         cred_t                  *credp)
 213 {
 214         xfs_mount_t             *mp = ip->i_mount;
 215         xfs_trans_t             *tp;
 216         int                     mask;
 217         int                     code;
 218         uint                    lock_flags;
 219         uint                    commit_flags=0;
 220         uid_t                   uid=0, iuid=0;
 221         gid_t                   gid=0, igid=0;
 222         int                     timeflags = 0;
 223         xfs_prid_t              projid=0, iprojid=0;
 224         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
 225         int                     file_owner;
 226         int                     need_iolock = 1;
 227
 228         xfs_itrace_entry(ip);
 229
 230         if (mp->m_flags & XFS_MOUNT_RDONLY)
 231                 return XFS_ERROR(EROFS);
 232
 233         /*
 234          * Cannot set certain attributes.
 235          */
 236         mask = vap->va_mask;
 237         if (mask & XFS_AT_NOSET) {
 238                 return XFS_ERROR(EINVAL);
 239         }
 240
 241         if (XFS_FORCED_SHUTDOWN(mp))
 242                 return XFS_ERROR(EIO);
 243
 244         /*
 245          * Timestamps do not need to be logged and hence do not
 246          * need to be done within a transaction.
 247          */
 248         if (mask & XFS_AT_UPDTIMES) {
 249                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 250                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 251                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 252                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 253                 xfs_ichgtime(ip, timeflags);
 254                 return 0;
 255         }
 256
 257         olddquot1 = olddquot2 = NULL;
 258         udqp = gdqp = NULL;
 259
 260         /*
 261          * If disk quotas is on, we make sure that the dquots do exist on disk,
 262          * before we start any other transactions. Trying to do this later
 263          * is messy. We don't care to take a readlock to look at the ids
 264          * in inode here, because we can't hold it across the trans_reserve.
 265          * If the IDs do change before we take the ilock, we're covered
 266          * because the i_*dquot fields will get updated anyway.
 267          */
 268         if (XFS_IS_QUOTA_ON(mp) &&
 269             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 270                 uint    qflags = 0;
 271
 272                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 273                         uid = vap->va_uid;
 274                         qflags |= XFS_QMOPT_UQUOTA;
 275                 } else {
 276                         uid = ip->i_d.di_uid;
 277                 }
 278                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 279                         gid = vap->va_gid;
 280                         qflags |= XFS_QMOPT_GQUOTA;
 281                 }  else {
 282                         gid = ip->i_d.di_gid;
 283                 }
 284                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 285                         projid = vap->va_projid;
 286                         qflags |= XFS_QMOPT_PQUOTA;
 287                 }  else {
 288                         projid = ip->i_d.di_projid;
 289                 }
 290                 /*
 291                  * We take a reference when we initialize udqp and gdqp,
 292                  * so it is important that we never blindly double trip on
 293                  * the same variable. See xfs_create() for an example.
 294                  */
 295                 ASSERT(udqp == NULL);
 296                 ASSERT(gdqp == NULL);
 297                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 298                                          &udqp, &gdqp);
 299                 if (code)
 300                         return code;
 301         }
 302
 303         /*
 304          * For the other attributes, we acquire the inode lock and
 305          * first do an error checking pass.
 306          */
 307         tp = NULL;
 308         lock_flags = XFS_ILOCK_EXCL;
 309         if (flags & ATTR_NOLOCK)
 310                 need_iolock = 0;
 311         if (!(mask & XFS_AT_SIZE)) {
 312                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 313                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 314                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 315                         commit_flags = 0;
 316                         if ((code = xfs_trans_reserve(tp, 0,
 317                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 318                                                      0, 0))) {
 319                                 lock_flags = 0;
 320                                 goto error_return;
 321                         }
 322                 }
 323         } else {
 324                 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
 325                     !(flags & ATTR_DMI)) {
 326                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 327                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
 328                                 vap->va_size, 0, dmflags, NULL);
 329                         if (code) {
 330                                 lock_flags = 0;
 331                                 goto error_return;
 332                         }
 333                 }
 334                 if (need_iolock)
 335                         lock_flags |= XFS_IOLOCK_EXCL;
 336         }
 337
 338         xfs_ilock(ip, lock_flags);
 339
 340         /* boolean: are we the file owner? */
 341         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 342
 343         /*
 344          * Change various properties of a file.
 345          * Only the owner or users with CAP_FOWNER
 346          * capability may do these things.
 347          */
 348         if (mask &
 349             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 350              XFS_AT_GID|XFS_AT_PROJID)) {
 351                 /*
 352                  * CAP_FOWNER overrides the following restrictions:
 353                  *
 354                  * The user ID of the calling process must be equal
 355                  * to the file owner ID, except in cases where the
 356                  * CAP_FSETID capability is applicable.
 357                  */
 358                 if (!file_owner && !capable(CAP_FOWNER)) {
 359                         code = XFS_ERROR(EPERM);
 360                         goto error_return;
 361                 }
 362
 363                 /*
 364                  * CAP_FSETID overrides the following restrictions:
 365                  *
 366                  * The effective user ID of the calling process shall match
 367                  * the file owner when setting the set-user-ID and
 368                  * set-group-ID bits on that file.
 369                  *
 370                  * The effective group ID or one of the supplementary group
 371                  * IDs of the calling process shall match the group owner of
 372                  * the file when setting the set-group-ID bit on that file
 373                  */
 374                 if (mask & XFS_AT_MODE) {
 375                         mode_t m = 0;
 376
 377                         if ((vap->va_mode & S_ISUID) && !file_owner)
 378                                 m |= S_ISUID;
 379                         if ((vap->va_mode & S_ISGID) &&
 380                             !in_group_p((gid_t)ip->i_d.di_gid))
 381                                 m |= S_ISGID;
 382 #if 0
 383                         /* Linux allows this, Irix doesn't. */
 384                         if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
 385                                 m |= S_ISVTX;
 386 #endif
 387                         if (m && !capable(CAP_FSETID))
 388                                 vap->va_mode &= ~m;
 389                 }
 390         }
 391
 392         /*
 393          * Change file ownership.  Must be the owner or privileged.
 394          * If the system was configured with the "restricted_chown"
 395          * option, the owner is not permitted to give away the file,
 396          * and can change the group id only to a group of which he
 397          * or she is a member.
 398          */
 399         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 400                 /*
 401                  * These IDs could have changed since we last looked at them.
 402                  * But, we're assured that if the ownership did change
 403                  * while we didn't have the inode locked, inode's dquot(s)
 404                  * would have changed also.
 405                  */
 406                 iuid = ip->i_d.di_uid;
 407                 iprojid = ip->i_d.di_projid;
 408                 igid = ip->i_d.di_gid;
 409                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 410                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 411                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 412                          iprojid;
 413
 414                 /*
 415                  * CAP_CHOWN overrides the following restrictions:
 416                  *
 417                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 418                  * shall override the restriction that a process cannot
 419                  * change the user ID of a file it owns and the restriction
 420                  * that the group ID supplied to the chown() function
 421                  * shall be equal to either the group ID or one of the
 422                  * supplementary group IDs of the calling process.
 423                  */
 424                 if (restricted_chown &&
 425                     (iuid != uid || (igid != gid &&
 426                                      !in_group_p((gid_t)gid))) &&
 427                     !capable(CAP_CHOWN)) {
 428                         code = XFS_ERROR(EPERM);
 429                         goto error_return;
 430                 }
 431                 /*
 432                  * Do a quota reservation only if uid/projid/gid is actually
 433                  * going to change.
 434                  */
 435                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 436                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 437                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 438                         ASSERT(tp);
 439                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 440                                                 capable(CAP_FOWNER) ?
 441                                                 XFS_QMOPT_FORCE_RES : 0);
 442                         if (code)       /* out of quota */
 443                                 goto error_return;
 444                 }
 445         }
 446
 447         /*
 448          * Truncate file.  Must have write permission and not be a directory.
 449          */
 450         if (mask & XFS_AT_SIZE) {
 451                 /* Short circuit the truncate case for zero length files */
 452                 if ((vap->va_size == 0) &&
 453                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
 454                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 455                         lock_flags &= ~XFS_ILOCK_EXCL;
 456                         if (mask & XFS_AT_CTIME)
 457                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 458                         code = 0;
 459                         goto error_return;
 460                 }
 461
 462                 if (S_ISDIR(ip->i_d.di_mode)) {
 463                         code = XFS_ERROR(EISDIR);
 464                         goto error_return;
 465                 } else if (!S_ISREG(ip->i_d.di_mode)) {
 466                         code = XFS_ERROR(EINVAL);
 467                         goto error_return;
 468                 }
 469                 /*
 470                  * Make sure that the dquots are attached to the inode.
 471                  */
 472                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 473                         goto error_return;
 474         }
 475
 476         /*
 477          * Change file access or modified times.
 478          */
 479         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 480                 if (!file_owner) {
 481                         if ((flags & ATTR_UTIME) &&
 482                             !capable(CAP_FOWNER)) {
 483                                 code = XFS_ERROR(EPERM);
 484                                 goto error_return;
 485                         }
 486                 }
 487         }
 488
 489         /*
 490          * Change extent size or realtime flag.
 491          */
 492         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 493                 /*
 494                  * Can't change extent size if any extents are allocated.
 495                  */
 496                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 497                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 498                      vap->va_extsize) ) {
 499                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 500                         goto error_return;
 501                 }
 502
 503                 /*
 504                  * Can't change realtime flag if any extents are allocated.
 505                  */
 506                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 507                     (mask & XFS_AT_XFLAGS) &&
 508                     (XFS_IS_REALTIME_INODE(ip)) !=
 509                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 510                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 511                         goto error_return;
 512                 }
 513                 /*
 514                  * Extent size must be a multiple of the appropriate block
 515                  * size, if set at all.
 516                  */
 517                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 518                         xfs_extlen_t    size;
 519
 520                         if (XFS_IS_REALTIME_INODE(ip) ||
 521                             ((mask & XFS_AT_XFLAGS) &&
 522                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 523                                 size = mp->m_sb.sb_rextsize <<
 524                                        mp->m_sb.sb_blocklog;
 525                         } else {
 526                                 size = mp->m_sb.sb_blocksize;
 527                         }
 528                         if (vap->va_extsize % size) {
 529                                 code = XFS_ERROR(EINVAL);
 530                                 goto error_return;
 531                         }
 532                 }
 533                 /*
 534                  * If realtime flag is set then must have realtime data.
 535                  */
 536                 if ((mask & XFS_AT_XFLAGS) &&
 537                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 538                         if ((mp->m_sb.sb_rblocks == 0) ||
 539                             (mp->m_sb.sb_rextsize == 0) ||
 540                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 541                                 code = XFS_ERROR(EINVAL);
 542                                 goto error_return;
 543                         }
 544                 }
 545
 546                 /*
 547                  * Can't modify an immutable/append-only file unless
 548                  * we have appropriate permission.
 549                  */
 550                 if ((mask & XFS_AT_XFLAGS) &&
 551                     (ip->i_d.di_flags &
 552                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 553                      (vap->va_xflags &
 554                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 555                     !capable(CAP_LINUX_IMMUTABLE)) {
 556                         code = XFS_ERROR(EPERM);
 557                         goto error_return;
 558                 }
 559         }
 560
 561         /*
 562          * Now we can make the changes.  Before we join the inode
 563          * to the transaction, if XFS_AT_SIZE is set then take care of
 564          * the part of the truncation that must be done without the
 565          * inode lock.  This needs to be done before joining the inode
 566          * to the transaction, because the inode cannot be unlocked
 567          * once it is a part of the transaction.
 568          */
 569         if (mask & XFS_AT_SIZE) {
 570                 code = 0;
 571                 if ((vap->va_size > ip->i_size) &&
 572                     (flags & ATTR_NOSIZETOK) == 0) {
 573                         code = xfs_igrow_start(ip, vap->va_size, credp);
 574                 }
 575                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 576
 577                 /*
 578                  * We are going to log the inode size change in this
 579                  * transaction so any previous writes that are beyond the on
 580                  * disk EOF and the new EOF that have not been written out need
 581                  * to be written here. If we do not write the data out, we
 582                  * expose ourselves to the null files problem.
 583                  *
 584                  * Only flush from the on disk size to the smaller of the in
 585                  * memory file size or the new size as that's the range we
 586                  * really care about here and prevents waiting for other data
 587                  * not within the range we care about here.
 588                  */
 589                 if (!code &&
 590                     (ip->i_size != ip->i_d.di_size) &&
 591                     (vap->va_size > ip->i_d.di_size)) {
 592                         code = xfs_flush_pages(ip,
 593                                         ip->i_d.di_size, vap->va_size,
 594                                         XFS_B_ASYNC, FI_NONE);
 595                 }
 596
 597                 /* wait for all I/O to complete */
 598                 vn_iowait(ip);
 599
 600                 if (!code)
 601                         code = xfs_itruncate_data(ip, vap->va_size);
 602                 if (code) {
 603                         ASSERT(tp == NULL);
 604                         lock_flags &= ~XFS_ILOCK_EXCL;
 605                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 606                         goto error_return;
 607                 }
 608                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 609                 if ((code = xfs_trans_reserve(tp, 0,
 610                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 611                                              XFS_TRANS_PERM_LOG_RES,
 612                                              XFS_ITRUNCATE_LOG_COUNT))) {
 613                         xfs_trans_cancel(tp, 0);
 614                         if (need_iolock)
 615                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 616                         return code;
 617                 }
 618                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 619                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 620         }
 621
 622         if (tp) {
 623                 xfs_trans_ijoin(tp, ip, lock_flags);
 624                 xfs_trans_ihold(tp, ip);
 625         }
 626
 627         /*
 628          * Truncate file.  Must have write permission and not be a directory.
 629          */
 630         if (mask & XFS_AT_SIZE) {
 631                 /*
 632                  * Only change the c/mtime if we are changing the size
 633                  * or we are explicitly asked to change it. This handles
 634                  * the semantic difference between truncate() and ftruncate()
 635                  * as implemented in the VFS.
 636                  */
 637                 if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
 638                         timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 639
 640                 if (vap->va_size > ip->i_size) {
 641                         xfs_igrow_finish(tp, ip, vap->va_size,
 642                             !(flags & ATTR_DMI));
 643                 } else if ((vap->va_size <= ip->i_size) ||
 644                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 645                         /*
 646                          * signal a sync transaction unless
 647                          * we're truncating an already unlinked
 648                          * file on a wsync filesystem
 649                          */
 650                         code = xfs_itruncate_finish(&tp, ip,
 651                                             (xfs_fsize_t)vap->va_size,
 652                                             XFS_DATA_FORK,
 653                                             ((ip->i_d.di_nlink != 0 ||
 654                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 655                                              ? 1 : 0));
 656                         if (code)
 657                                 goto abort_return;
 658                         /*
 659                          * Truncated "down", so we're removing references
 660                          * to old data here - if we now delay flushing for
 661                          * a long time, we expose ourselves unduly to the
 662                          * notorious NULL files problem.  So, we mark this
 663                          * vnode and flush it when the file is closed, and
 664                          * do not wait the usual (long) time for writeout.
 665                          */
 666                         xfs_iflags_set(ip, XFS_ITRUNCATED);
 667                 }
 668         }
 669
 670         /*
 671          * Change file access modes.
 672          */
 673         if (mask & XFS_AT_MODE) {
 674                 ip->i_d.di_mode &= S_IFMT;
 675                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 676
 677                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 678                 timeflags |= XFS_ICHGTIME_CHG;
 679         }
 680
 681         /*
 682          * Change file ownership.  Must be the owner or privileged.
 683          * If the system was configured with the "restricted_chown"
 684          * option, the owner is not permitted to give away the file,
 685          * and can change the group id only to a group of which he
 686          * or she is a member.
 687          */
 688         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
 689                 /*
 690                  * CAP_FSETID overrides the following restrictions:
 691                  *
 692                  * The set-user-ID and set-group-ID bits of a file will be
 693                  * cleared upon successful return from chown()
 694                  */
 695                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 696                     !capable(CAP_FSETID)) {
 697                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 698                 }
 699
 700                 /*
 701                  * Change the ownerships and register quota modifications
 702                  * in the transaction.
 703                  */
 704                 if (iuid != uid) {
 705                         if (XFS_IS_UQUOTA_ON(mp)) {
 706                                 ASSERT(mask & XFS_AT_UID);
 707                                 ASSERT(udqp);
 708                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 709                                                         &ip->i_udquot, udqp);
 710                         }
 711                         ip->i_d.di_uid = uid;
 712                 }
 713                 if (igid != gid) {
 714                         if (XFS_IS_GQUOTA_ON(mp)) {
 715                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 716                                 ASSERT(mask & XFS_AT_GID);
 717                                 ASSERT(gdqp);
 718                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 719                                                         &ip->i_gdquot, gdqp);
 720                         }
 721                         ip->i_d.di_gid = gid;
 722                 }
 723                 if (iprojid != projid) {
 724                         if (XFS_IS_PQUOTA_ON(mp)) {
 725                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
 726                                 ASSERT(mask & XFS_AT_PROJID);
 727                                 ASSERT(gdqp);
 728                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 729                                                         &ip->i_gdquot, gdqp);
 730                         }
 731                         ip->i_d.di_projid = projid;
 732                         /*
 733                          * We may have to rev the inode as well as
 734                          * the superblock version number since projids didn't
 735                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 736                          */
 737                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 738                                 xfs_bump_ino_vers2(tp, ip);
 739                 }
 740
 741                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 742                 timeflags |= XFS_ICHGTIME_CHG;
 743         }
 744
 745
 746         /*
 747          * Change file access or modified times.
 748          */
 749         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 750                 if (mask & XFS_AT_ATIME) {
 751                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 752                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 753                         ip->i_update_core = 1;
 754                         timeflags &= ~XFS_ICHGTIME_ACC;
 755                 }
 756                 if (mask & XFS_AT_MTIME) {
 757                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 758                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 759                         timeflags &= ~XFS_ICHGTIME_MOD;
 760                         timeflags |= XFS_ICHGTIME_CHG;
 761                 }
 762                 if (tp && (flags & ATTR_UTIME))
 763                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 764         }
 765
 766         /*
 767          * Change XFS-added attributes.
 768          */
 769         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 770                 if (mask & XFS_AT_EXTSIZE) {
 771                         /*
 772                          * Converting bytes to fs blocks.
 773                          */
 774                         ip->i_d.di_extsize = vap->va_extsize >>
 775                                 mp->m_sb.sb_blocklog;
 776                 }
 777                 if (mask & XFS_AT_XFLAGS) {
 778                         uint    di_flags;
 779
 780                         /* can't set PREALLOC this way, just preserve it */
 781                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 782                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 783                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
 784                         if (vap->va_xflags & XFS_XFLAG_APPEND)
 785                                 di_flags |= XFS_DIFLAG_APPEND;
 786                         if (vap->va_xflags & XFS_XFLAG_SYNC)
 787                                 di_flags |= XFS_DIFLAG_SYNC;
 788                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
 789                                 di_flags |= XFS_DIFLAG_NOATIME;
 790                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
 791                                 di_flags |= XFS_DIFLAG_NODUMP;
 792                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 793                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 794                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
 795                                 di_flags |= XFS_DIFLAG_NODEFRAG;
 796                         if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
 797                                 di_flags |= XFS_DIFLAG_FILESTREAM;
 798                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 799                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 800                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 801                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 802                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
 803                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
 804                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 805                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 806                                 if (vap->va_xflags & XFS_XFLAG_REALTIME)
 807                                         di_flags |= XFS_DIFLAG_REALTIME;
 808                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
 809                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 810                         }
 811                         ip->i_d.di_flags = di_flags;
 812                 }
 813                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 814                 timeflags |= XFS_ICHGTIME_CHG;
 815         }
 816
 817         /*
 818          * Change file inode change time only if XFS_AT_CTIME set
 819          * AND we have been called by a DMI function.
 820          */
 821
 822         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 823                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 824                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 825                 ip->i_update_core = 1;
 826                 timeflags &= ~XFS_ICHGTIME_CHG;
 827         }
 828
 829         /*
 830          * Send out timestamp changes that need to be set to the
 831          * current time.  Not done when called by a DMI function.
 832          */
 833         if (timeflags && !(flags & ATTR_DMI))
 834                 xfs_ichgtime(ip, timeflags);
 835
 836         XFS_STATS_INC(xs_ig_attrchg);
 837
 838         /*
 839          * If this is a synchronous mount, make sure that the
 840          * transaction goes to disk before returning to the user.
 841          * This is slightly sub-optimal in that truncates require
 842          * two sync transactions instead of one for wsync filesystems.
 843          * One for the truncate and one for the timestamps since we
 844          * don't want to change the timestamps unless we're sure the
 845          * truncate worked.  Truncates are less than 1% of the laddis
 846          * mix so this probably isn't worth the trouble to optimize.
 847          */
 848         code = 0;
 849         if (tp) {
 850                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 851                         xfs_trans_set_sync(tp);
 852
 853                 code = xfs_trans_commit(tp, commit_flags);
 854         }
 855
 856         xfs_iunlock(ip, lock_flags);
 857
 858         /*
 859          * Release any dquot(s) the inode had kept before chown.
 860          */
 861         XFS_QM_DQRELE(mp, olddquot1);
 862         XFS_QM_DQRELE(mp, olddquot2);
 863         XFS_QM_DQRELE(mp, udqp);
 864         XFS_QM_DQRELE(mp, gdqp);
 865
 866         if (code) {
 867                 return code;
 868         }
 869
 870         if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
 871             !(flags & ATTR_DMI)) {
 872                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
 873                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 874                                         0, 0, AT_DELAY_FLAG(flags));
 875         }
 876         return 0;
 877
 878  abort_return:
 879         commit_flags |= XFS_TRANS_ABORT;
 880         /* FALLTHROUGH */
 881  error_return:
 882         XFS_QM_DQRELE(mp, udqp);
 883         XFS_QM_DQRELE(mp, gdqp);
 884         if (tp) {
 885                 xfs_trans_cancel(tp, commit_flags);
 886         }
 887         if (lock_flags != 0) {
 888                 xfs_iunlock(ip, lock_flags);
 889         }
 890         return code;
 891 }
 892
 893 /*
 894  * The maximum pathlen is 1024 bytes. Since the minimum file system
 895  * blocksize is 512 bytes, we can get a max of 2 extents back from
 896  * bmapi.
 897  */
 898 #define SYMLINK_MAPS 2
 899
 900 STATIC int
 901 xfs_readlink_bmap(
 902         xfs_inode_t     *ip,
 903         char            *link)
 904 {
 905         xfs_mount_t     *mp = ip->i_mount;
 906         int             pathlen = ip->i_d.di_size;
 907         int             nmaps = SYMLINK_MAPS;
 908         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 909         xfs_daddr_t     d;
 910         int             byte_cnt;
 911         int             n;
 912         xfs_buf_t       *bp;
 913         int             error = 0;
 914
 915         error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
 916                         mval, &nmaps, NULL, NULL);
 917         if (error)
 918                 goto out;
 919
 920         for (n = 0; n < nmaps; n++) {
 921                 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 922                 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 923
 924                 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
 925                 error = XFS_BUF_GETERROR(bp);
 926                 if (error) {
 927                         xfs_ioerror_alert("xfs_readlink",
 928                                   ip->i_mount, bp, XFS_BUF_ADDR(bp));
 929                         xfs_buf_relse(bp);
 930                         goto out;
 931                 }
 932                 if (pathlen < byte_cnt)
 933                         byte_cnt = pathlen;
 934                 pathlen -= byte_cnt;
 935
 936                 memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
 937                 xfs_buf_relse(bp);
 938         }
 939
 940         link[ip->i_d.di_size] = '\0';
 941         error = 0;
 942
 943  out:
 944         return error;
 945 }
 946
 947 int
 948 xfs_readlink(
 949         xfs_inode_t     *ip,
 950         char            *link)
 951 {
 952         xfs_mount_t     *mp = ip->i_mount;
 953         int             pathlen;
 954         int             error = 0;
 955
 956         xfs_itrace_entry(ip);
 957
 958         if (XFS_FORCED_SHUTDOWN(mp))
 959                 return XFS_ERROR(EIO);
 960
 961         xfs_ilock(ip, XFS_ILOCK_SHARED);
 962
 963         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 964         ASSERT(ip->i_d.di_size <= MAXPATHLEN);
 965
 966         pathlen = ip->i_d.di_size;
 967         if (!pathlen)
 968                 goto out;
 969
 970         if (ip->i_df.if_flags & XFS_IFINLINE) {
 971                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 972                 link[pathlen] = '\0';
 973         } else {
 974                 error = xfs_readlink_bmap(ip, link);
 975         }
 976
 977  out:
 978         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 979         return error;
 980 }
 981
 982 /*
 983  * xfs_fsync
 984  *
 985  * This is called to sync the inode and its data out to disk.
 986  * We need to hold the I/O lock while flushing the data, and
 987  * the inode lock while flushing the inode.  The inode lock CANNOT
 988  * be held while flushing the data, so acquire after we're done
 989  * with that.
 990  */
 991 int
 992 xfs_fsync(
 993         xfs_inode_t     *ip,
 994         int             flag,
 995         xfs_off_t       start,
 996         xfs_off_t       stop)
 997 {
 998         xfs_trans_t     *tp;
 999         int             error;
1000         int             log_flushed = 0, changed = 1;
1001
1002         xfs_itrace_entry(ip);
1003
1004         ASSERT(start >= 0 && stop >= -1);
1005
1006         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1007                 return XFS_ERROR(EIO);
1008
1009         if (flag & FSYNC_DATA)
1010                 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
1011
1012         /*
1013          * We always need to make sure that the required inode state
1014          * is safe on disk.  The vnode might be clean but because
1015          * of committed transactions that haven't hit the disk yet.
1016          * Likewise, there could be unflushed non-transactional
1017          * changes to the inode core that have to go to disk.
1018          *
1019          * The following code depends on one assumption:  that
1020          * any transaction that changes an inode logs the core
1021          * because it has to change some field in the inode core
1022          * (typically nextents or nblocks).  That assumption
1023          * implies that any transactions against an inode will
1024          * catch any non-transactional updates.  If inode-altering
1025          * transactions exist that violate this assumption, the
1026          * code breaks.  Right now, it figures that if the involved
1027          * update_* field is clear and the inode is unpinned, the
1028          * inode is clean.  Either it's been flushed or it's been
1029          * committed and the commit has hit the disk unpinning the inode.
1030          * (Note that xfs_inode_item_format() called at commit clears
1031          * the update_* fields.)
1032          */
1033         xfs_ilock(ip, XFS_ILOCK_SHARED);
1034
1035         /* If we are flushing data then we care about update_size
1036          * being set, otherwise we care about update_core
1037          */
1038         if ((flag & FSYNC_DATA) ?
1039                         (ip->i_update_size == 0) :
1040                         (ip->i_update_core == 0)) {
1041                 /*
1042                  * Timestamps/size haven't changed since last inode
1043                  * flush or inode transaction commit.  That means
1044                  * either nothing got written or a transaction
1045                  * committed which caught the updates.  If the
1046                  * latter happened and the transaction hasn't
1047                  * hit the disk yet, the inode will be still
1048                  * be pinned.  If it is, force the log.
1049                  */
1050
1051                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1052
1053                 if (xfs_ipincount(ip)) {
1054                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1055                                       XFS_LOG_FORCE |
1056                                       ((flag & FSYNC_WAIT)
1057                                        ? XFS_LOG_SYNC : 0),
1058                                       &log_flushed);
1059                 } else {
1060                         /*
1061                          * If the inode is not pinned and nothing
1062                          * has changed we don't need to flush the
1063                          * cache.
1064                          */
1065                         changed = 0;
1066                 }
1067                 error = 0;
1068         } else  {
1069                 /*
1070                  * Kick off a transaction to log the inode
1071                  * core to get the updates.  Make it
1072                  * sync if FSYNC_WAIT is passed in (which
1073                  * is done by everybody but specfs).  The
1074                  * sync transaction will also force the log.
1075                  */
1076                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1077                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1078                 if ((error = xfs_trans_reserve(tp, 0,
1079                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1080                                 0, 0, 0)))  {
1081                         xfs_trans_cancel(tp, 0);
1082                         return error;
1083                 }
1084                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1085
1086                 /*
1087                  * Note - it's possible that we might have pushed
1088                  * ourselves out of the way during trans_reserve
1089                  * which would flush the inode.  But there's no
1090                  * guarantee that the inode buffer has actually
1091                  * gone out yet (it's delwri).  Plus the buffer
1092                  * could be pinned anyway if it's part of an
1093                  * inode in another recent transaction.  So we
1094                  * play it safe and fire off the transaction anyway.
1095                  */
1096                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1097                 xfs_trans_ihold(tp, ip);
1098                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1099                 if (flag & FSYNC_WAIT)
1100                         xfs_trans_set_sync(tp);
1101                 error = _xfs_trans_commit(tp, 0, &log_flushed);
1102
1103                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1104         }
1105
1106         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1107                 /*
1108                  * If the log write didn't issue an ordered tag we need
1109                  * to flush the disk cache for the data device now.
1110                  */
1111                 if (!log_flushed)
1112                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1113
1114                 /*
1115                  * If this inode is on the RT dev we need to flush that
1116                  * cache as well.
1117                  */
1118                 if (XFS_IS_REALTIME_INODE(ip))
1119                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1120         }
1121
1122         return error;
1123 }
1124
1125 /*
1126  * This is called by xfs_inactive to free any blocks beyond eof
1127  * when the link count isn't zero and by xfs_dm_punch_hole() when
1128  * punching a hole to EOF.
1129  */
1130 int
1131 xfs_free_eofblocks(
1132         xfs_mount_t     *mp,
1133         xfs_inode_t     *ip,
1134         int             flags)
1135 {
1136         xfs_trans_t     *tp;
1137         int             error;
1138         xfs_fileoff_t   end_fsb;
1139         xfs_fileoff_t   last_fsb;
1140         xfs_filblks_t   map_len;
1141         int             nimaps;
1142         xfs_bmbt_irec_t imap;
1143         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1144
1145         /*
1146          * Figure out if there are any blocks beyond the end
1147          * of the file.  If not, then there is nothing to do.
1148          */
1149         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1150         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1151         map_len = last_fsb - end_fsb;
1152         if (map_len <= 0)
1153                 return 0;
1154
1155         nimaps = 1;
1156         xfs_ilock(ip, XFS_ILOCK_SHARED);
1157         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1158                           NULL, 0, &imap, &nimaps, NULL, NULL);
1159         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1160
1161         if (!error && (nimaps != 0) &&
1162             (imap.br_startblock != HOLESTARTBLOCK ||
1163              ip->i_delayed_blks)) {
1164                 /*
1165                  * Attach the dquots to the inode up front.
1166                  */
1167                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1168                         return error;
1169
1170                 /*
1171                  * There are blocks after the end of file.
1172                  * Free them up now by truncating the file to
1173                  * its current size.
1174                  */
1175                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1176
1177                 /*
1178                  * Do the xfs_itruncate_start() call before
1179                  * reserving any log space because
1180                  * itruncate_start will call into the buffer
1181                  * cache and we can't
1182                  * do that within a transaction.
1183                  */
1184                 if (use_iolock)
1185                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1186                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1187                                     ip->i_size);
1188                 if (error) {
1189                         xfs_trans_cancel(tp, 0);
1190                         if (use_iolock)
1191                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1192                         return error;
1193                 }
1194
1195                 error = xfs_trans_reserve(tp, 0,
1196                                           XFS_ITRUNCATE_LOG_RES(mp),
1197                                           0, XFS_TRANS_PERM_LOG_RES,
1198                                           XFS_ITRUNCATE_LOG_COUNT);
1199                 if (error) {
1200                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1201                         xfs_trans_cancel(tp, 0);
1202                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1203                         return error;
1204                 }
1205
1206                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1207                 xfs_trans_ijoin(tp, ip,
1208                                 XFS_IOLOCK_EXCL |
1209                                 XFS_ILOCK_EXCL);
1210                 xfs_trans_ihold(tp, ip);
1211
1212                 error = xfs_itruncate_finish(&tp, ip,
1213                                              ip->i_size,
1214                                              XFS_DATA_FORK,
1215                                              0);
1216                 /*
1217                  * If we get an error at this point we
1218                  * simply don't bother truncating the file.
1219                  */
1220                 if (error) {
1221                         xfs_trans_cancel(tp,
1222                                          (XFS_TRANS_RELEASE_LOG_RES |
1223                                           XFS_TRANS_ABORT));
1224                 } else {
1225                         error = xfs_trans_commit(tp,
1226                                                 XFS_TRANS_RELEASE_LOG_RES);
1227                 }
1228                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1229                                             : XFS_ILOCK_EXCL));
1230         }
1231         return error;
1232 }
1233
1234 /*
1235  * Free a symlink that has blocks associated with it.
1236  */
1237 STATIC int
1238 xfs_inactive_symlink_rmt(
1239         xfs_inode_t     *ip,
1240         xfs_trans_t     **tpp)
1241 {
1242         xfs_buf_t       *bp;
1243         int             committed;
1244         int             done;
1245         int             error;
1246         xfs_fsblock_t   first_block;
1247         xfs_bmap_free_t free_list;
1248         int             i;
1249         xfs_mount_t     *mp;
1250         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1251         int             nmaps;
1252         xfs_trans_t     *ntp;
1253         int             size;
1254         xfs_trans_t     *tp;
1255
1256         tp = *tpp;
1257         mp = ip->i_mount;
1258         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1259         /*
1260          * We're freeing a symlink that has some
1261          * blocks allocated to it.  Free the
1262          * blocks here.  We know that we've got
1263          * either 1 or 2 extents and that we can
1264          * free them all in one bunmapi call.
1265          */
1266         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1267         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1268                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1269                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1270                 xfs_trans_cancel(tp, 0);
1271                 *tpp = NULL;
1272                 return error;
1273         }
1274         /*
1275          * Lock the inode, fix the size, and join it to the transaction.
1276          * Hold it so in the normal path, we still have it locked for
1277          * the second transaction.  In the error paths we need it
1278          * held so the cancel won't rele it, see below.
1279          */
1280         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1281         size = (int)ip->i_d.di_size;
1282         ip->i_d.di_size = 0;
1283         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1284         xfs_trans_ihold(tp, ip);
1285         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1286         /*
1287          * Find the block(s) so we can inval and unmap them.
1288          */
1289         done = 0;
1290         XFS_BMAP_INIT(&free_list, &first_block);
1291         nmaps = ARRAY_SIZE(mval);
1292         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1293                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1294                         &free_list, NULL)))
1295                 goto error0;
1296         /*
1297          * Invalidate the block(s).
1298          */
1299         for (i = 0; i < nmaps; i++) {
1300                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1301                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1302                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1303                 xfs_trans_binval(tp, bp);
1304         }
1305         /*
1306          * Unmap the dead block(s) to the free_list.
1307          */
1308         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1309                         &first_block, &free_list, NULL, &done)))
1310                 goto error1;
1311         ASSERT(done);
1312         /*
1313          * Commit the first transaction.  This logs the EFI and the inode.
1314          */
1315         if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
1316                 goto error1;
1317         /*
1318          * The transaction must have been committed, since there were
1319          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1320          * The new tp has the extent freeing and EFDs.
1321          */
1322         ASSERT(committed);
1323         /*
1324          * The first xact was committed, so add the inode to the new one.
1325          * Mark it dirty so it will be logged and moved forward in the log as
1326          * part of every commit.
1327          */
1328         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1329         xfs_trans_ihold(tp, ip);
1330         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1331         /*
1332          * Get a new, empty transaction to return to our caller.
1333          */
1334         ntp = xfs_trans_dup(tp);
1335         /*
1336          * Commit the transaction containing extent freeing and EFDs.
1337          * If we get an error on the commit here or on the reserve below,
1338          * we need to unlock the inode since the new transaction doesn't
1339          * have the inode attached.
1340          */
1341         error = xfs_trans_commit(tp, 0);
1342         tp = ntp;
1343         if (error) {
1344                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1345                 goto error0;
1346         }
1347         /*
1348          * Remove the memory for extent descriptions (just bookkeeping).
1349          */
1350         if (ip->i_df.if_bytes)
1351                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1352         ASSERT(ip->i_df.if_bytes == 0);
1353         /*
1354          * Put an itruncate log reservation in the new transaction
1355          * for our caller.
1356          */
1357         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1358                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1359                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1360                 goto error0;
1361         }
1362         /*
1363          * Return with the inode locked but not joined to the transaction.
1364          */
1365         *tpp = tp;
1366         return 0;
1367
1368  error1:
1369         xfs_bmap_cancel(&free_list);
1370  error0:
1371         /*
1372          * Have to come here with the inode locked and either
1373          * (held and in the transaction) or (not in the transaction).
1374          * If the inode isn't held then cancel would iput it, but
1375          * that's wrong since this is inactive and the vnode ref
1376          * count is 0 already.
1377          * Cancel won't do anything to the inode if held, but it still
1378          * needs to be locked until the cancel is done, if it was
1379          * joined to the transaction.
1380          */
1381         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1382         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1383         *tpp = NULL;
1384         return error;
1385
1386 }
1387
1388 STATIC int
1389 xfs_inactive_symlink_local(
1390         xfs_inode_t     *ip,
1391         xfs_trans_t     **tpp)
1392 {
1393         int             error;
1394
1395         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1396         /*
1397          * We're freeing a symlink which fit into
1398          * the inode.  Just free the memory used
1399          * to hold the old symlink.
1400          */
1401         error = xfs_trans_reserve(*tpp, 0,
1402                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1403                                   0, XFS_TRANS_PERM_LOG_RES,
1404                                   XFS_ITRUNCATE_LOG_COUNT);
1405
1406         if (error) {
1407                 xfs_trans_cancel(*tpp, 0);
1408                 *tpp = NULL;
1409                 return error;
1410         }
1411         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1412
1413         /*
1414          * Zero length symlinks _can_ exist.
1415          */
1416         if (ip->i_df.if_bytes > 0) {
1417                 xfs_idata_realloc(ip,
1418                                   -(ip->i_df.if_bytes),
1419                                   XFS_DATA_FORK);
1420                 ASSERT(ip->i_df.if_bytes == 0);
1421         }
1422         return 0;
1423 }
1424
1425 STATIC int
1426 xfs_inactive_attrs(
1427         xfs_inode_t     *ip,
1428         xfs_trans_t     **tpp)
1429 {
1430         xfs_trans_t     *tp;
1431         int             error;
1432         xfs_mount_t     *mp;
1433
1434         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1435         tp = *tpp;
1436         mp = ip->i_mount;
1437         ASSERT(ip->i_d.di_forkoff != 0);
1438         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1439         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1440         if (error)
1441                 goto error_unlock;
1442
1443         error = xfs_attr_inactive(ip);
1444         if (error)
1445                 goto error_unlock;
1446
1447         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1448         error = xfs_trans_reserve(tp, 0,
1449                                   XFS_IFREE_LOG_RES(mp),
1450                                   0, XFS_TRANS_PERM_LOG_RES,
1451                                   XFS_INACTIVE_LOG_COUNT);
1452         if (error)
1453                 goto error_cancel;
1454
1455         xfs_ilock(ip, XFS_ILOCK_EXCL);
1456         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1457         xfs_trans_ihold(tp, ip);
1458         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1459
1460         ASSERT(ip->i_d.di_anextents == 0);
1461
1462         *tpp = tp;
1463         return 0;
1464
1465 error_cancel:
1466         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1467         xfs_trans_cancel(tp, 0);
1468 error_unlock:
1469         *tpp = NULL;
1470         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1471         return error;
1472 }
1473
1474 int
1475 xfs_release(
1476         xfs_inode_t     *ip)
1477 {
1478         bhv_vnode_t     *vp = XFS_ITOV(ip);
1479         xfs_mount_t     *mp = ip->i_mount;
1480         int             error;
1481
1482         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
1483                 return 0;
1484
1485         /* If this is a read-only mount, don't do this (would generate I/O) */
1486         if (mp->m_flags & XFS_MOUNT_RDONLY)
1487                 return 0;
1488
1489         if (!XFS_FORCED_SHUTDOWN(mp)) {
1490                 int truncated;
1491
1492                 /*
1493                  * If we are using filestreams, and we have an unlinked
1494                  * file that we are processing the last close on, then nothing
1495                  * will be able to reopen and write to this file. Purge this
1496                  * inode from the filestreams cache so that it doesn't delay
1497                  * teardown of the inode.
1498                  */
1499                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1500                         xfs_filestream_deassociate(ip);
1501
1502                 /*
1503                  * If we previously truncated this file and removed old data
1504                  * in the process, we want to initiate "early" writeout on
1505                  * the last close.  This is an attempt to combat the notorious
1506                  * NULL files problem which is particularly noticable from a
1507                  * truncate down, buffered (re-)write (delalloc), followed by
1508                  * a crash.  What we are effectively doing here is
1509                  * significantly reducing the time window where we'd otherwise
1510                  * be exposed to that problem.
1511                  */
1512                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1513                 if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1514                         xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1515         }
1516
1517         if (ip->i_d.di_nlink != 0) {
1518                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1519                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1520                        ip->i_delayed_blks > 0)) &&
1521                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1522                     (!(ip->i_d.di_flags &
1523                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1524                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1525                         if (error)
1526                                 return error;
1527                 }
1528         }
1529
1530         return 0;
1531 }
1532
1533 /*
1534  * xfs_inactive
1535  *
1536  * This is called when the vnode reference count for the vnode
1537  * goes to zero.  If the file has been unlinked, then it must
1538  * now be truncated.  Also, we clear all of the read-ahead state
1539  * kept for the inode here since the file is now closed.
      *
      * Outline of what the code below does:
      *  - an inode with di_mode == 0 or a bad vnode is returned untouched;
      *  - a DM_EVENT_DESTROY event is sent for an unlinked inode when that
      *    event is enabled;
      *  - on a read-only mount nothing else happens;
      *  - a still-linked inode gets at most an xfs_free_eofblocks() call;
      *  - an unlinked inode has its data (regular file) or symlink contents
      *    removed, its attribute fork torn down, and is then freed with
      *    xfs_ifree() inside an XFS_TRANS_INACTIVE transaction.
      *
      * Every return path yields VN_INACTIVE_CACHE; errors from the helpers
      * are logged or cause an early return but are never passed back to
      * the caller.
1540  */
1541 int
1542 xfs_inactive(
1543         xfs_inode_t     *ip)
1544 {
1545         bhv_vnode_t     *vp = XFS_ITOV(ip);
1546         xfs_bmap_free_t free_list;
1547         xfs_fsblock_t   first_block;
1548         int             committed;
1549         xfs_trans_t     *tp;
1550         xfs_mount_t     *mp;
1551         int             error;
1552         int             truncate;
1553
1554         xfs_itrace_entry(ip);
1555
1556         /*
1557          * If the inode is already free, then there can be nothing
1558          * to clean up here.
1559          */
1560         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1561                 ASSERT(ip->i_df.if_real_bytes == 0);
1562                 ASSERT(ip->i_df.if_broot_bytes == 0);
1563                 return VN_INACTIVE_CACHE;
1564         }
1565
1566         /*
1567          * Only do a truncate if it's a regular file with
1568          * some actual space in it.  It's OK to look at the
1569          * inode's fields without the lock because we're the
1570          * only one with a reference to the inode.
1571          */
1572         truncate = ((ip->i_d.di_nlink == 0) &&
1573             ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1574              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1575             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1576
1577         mp = ip->i_mount;
1578
1579         if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
1580                 XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
1581
1582         error = 0;
1583
1584         /* If this is a read-only mount, don't do this (would generate I/O) */
1585         if (mp->m_flags & XFS_MOUNT_RDONLY)
1586                 goto out;
1587
             /*
              * Still linked: the inode is not torn down.  The only work is
              * the conditional xfs_free_eofblocks() call.  Unlike the
              * matching test in xfs_release(), pending delayed blocks
              * override the PREALLOC/APPEND exclusion here.
              */
1588         if (ip->i_d.di_nlink != 0) {
1589                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1590                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1591                        ip->i_delayed_blks > 0)) &&
1592                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1593                      (!(ip->i_d.di_flags &
1594                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1595                       (ip->i_delayed_blks != 0)))) {
1596                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1597                         if (error)
1598                                 return VN_INACTIVE_CACHE;
1599                 }
1600                 goto out;
1601         }
1602
1603         ASSERT(ip->i_d.di_nlink == 0);
1604
1605         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1606                 return VN_INACTIVE_CACHE;
1607
             /*
              * Each of the three branches below (truncate, symlink, other)
              * joins the inode to a transaction with both the iolock and
              * the ilock and holds it, so that the common tail can tear
              * down the attribute fork and free the inode.
              */
1608         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1609         if (truncate) {
1610                 /*
1611                  * Do the xfs_itruncate_start() call before
1612                  * reserving any log space because itruncate_start
1613                  * will call into the buffer cache and we can't
1614                  * do that within a transaction.
1615                  */
1616                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1617
1618                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1619                 if (error) {
1620                         xfs_trans_cancel(tp, 0);
1621                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1622                         return VN_INACTIVE_CACHE;
1623                 }
1624
1625                 error = xfs_trans_reserve(tp, 0,
1626                                           XFS_ITRUNCATE_LOG_RES(mp),
1627                                           0, XFS_TRANS_PERM_LOG_RES,
1628                                           XFS_ITRUNCATE_LOG_COUNT);
1629                 if (error) {
1630                         /* Don't call itruncate_cleanup */
1631                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1632                         xfs_trans_cancel(tp, 0);
1633                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1634                         return VN_INACTIVE_CACHE;
1635                 }
1636
1637                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1638                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1639                 xfs_trans_ihold(tp, ip);
1640
1641                 /*
1642                  * normally, we have to run xfs_itruncate_finish sync.
1643                  * But if filesystem is wsync and we're in the inactive
1644                  * path, then we know that nlink == 0, and that the
1645                  * xaction that made nlink == 0 is permanently committed
1646                  * since xfs_remove runs as a synchronous transaction.
1647                  */
1648                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1649                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1650
1651                 if (error) {
1652                         xfs_trans_cancel(tp,
1653                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1654                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1655                         return VN_INACTIVE_CACHE;
1656                 }
1657         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1658
1659                 /*
1660                  * If we get an error while cleaning up a
1661                  * symlink we bail out.
1662                  */
1663                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1664                         xfs_inactive_symlink_rmt(ip, &tp) :
1665                         xfs_inactive_symlink_local(ip, &tp);
1666
1667                 if (error) {
1668                         ASSERT(tp == NULL);
1669                         return VN_INACTIVE_CACHE;
1670                 }
1671
                     /*
                      * NOTE(review): no xfs_ilock() is taken in this branch;
                      * presumably the symlink helpers return with both locks
                      * held and a usable transaction in tp - confirm against
                      * those helpers.
                      */
1672                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1673                 xfs_trans_ihold(tp, ip);
1674         } else {
                     /*
                      * Neither a regular file needing truncation nor a
                      * symlink: reserve log space for the inode free and
                      * take both locks.
                      */
1675                 error = xfs_trans_reserve(tp, 0,
1676                                           XFS_IFREE_LOG_RES(mp),
1677                                           0, XFS_TRANS_PERM_LOG_RES,
1678                                           XFS_INACTIVE_LOG_COUNT);
1679                 if (error) {
1680                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1681                         xfs_trans_cancel(tp, 0);
1682                         return VN_INACTIVE_CACHE;
1683                 }
1684
1685                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1686                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1687                 xfs_trans_ihold(tp, ip);
1688         }
1689
1690         /*
1691          * If there are attributes associated with the file
1692          * then blow them away now.  The code calls a routine
1693          * that recursively deconstructs the attribute fork.
1694          * We need to just commit the current transaction
1695          * because we can't use it for xfs_attr_inactive().
1696          */
1697         if (ip->i_d.di_anextents > 0) {
1698                 error = xfs_inactive_attrs(ip, &tp);
1699                 /*
1700                  * If we got an error, the transaction is already
1701                  * cancelled, and the inode is unlocked. Just get out.
1702                  */
1703                  if (error)
1704                          return VN_INACTIVE_CACHE;
1705         } else if (ip->i_afp) {
                     /* No attribute extents: only the in-core fork is destroyed. */
1706                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1707         }
1708
1709         /*
1710          * Free the inode.
1711          */
1712         XFS_BMAP_INIT(&free_list, &first_block);
1713         error = xfs_ifree(tp, ip, &free_list);
1714         if (error) {
1715                 /*
1716                  * If we fail to free the inode, shut down.  The cancel
1717                  * might do that, we need to make sure.  Otherwise the
1718                  * inode might be lost for a long time or forever.
1719                  */
1720                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1721                         cmn_err(CE_NOTE,
1722                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1723                                 error, mp->m_fsname);
1724                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1725                 }
1726                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1727         } else {
1728                 /*
1729                  * Credit the quota account(s). The inode is gone.
1730                  */
1731                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1732
1733                 /*
1734                  * Just ignore errors at this point.  There is nothing we can
1735                  * do except to try to keep going. Make sure it's not a silent
1736                  * error.
1737                  */
1738                 error = xfs_bmap_finish(&tp,  &free_list, &committed);
1739                 if (error)
1740                         xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
1741                                 "xfs_bmap_finish() returned error %d", error);
1742                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1743                 if (error)
1744                         xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
1745                                 "xfs_trans_commit() returned error %d", error);
1746         }
1747         /*
1748          * Release the dquots held by inode, if any.
1749          */
1750         XFS_QM_DQDETACH(mp, ip);
1751
             /* Both locks are dropped here, on the cancel and the commit path. */
1752         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1753
1754  out:
1755         return VN_INACTIVE_CACHE;
1756 }
1757
1758
1759 int
1760 xfs_lookup(
1761         xfs_inode_t             *dp,
1762         struct xfs_name         *name,
1763         xfs_inode_t             **ipp)
1764 {
1765         xfs_inode_t             *ip;
1766         xfs_ino_t               e_inum;
1767         int                     error;
1768         uint                    lock_mode;
1769
1770         xfs_itrace_entry(dp);
1771
1772         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1773                 return XFS_ERROR(EIO);
1774
1775         lock_mode = xfs_ilock_map_shared(dp);
1776         error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip);
1777         if (!error) {
1778                 *ipp = ip;
1779                 xfs_itrace_ref(ip);
1780         }
1781         xfs_iunlock_map_shared(dp, lock_mode);
1782         return error;
1783 }
1784
/*
 * xfs_create
 *
 * Allocate a new inode and enter it in directory 'dp' under 'name',
 * all inside an XFS_TRANS_CREATE transaction.
 *
 *   dp    - directory receiving the new entry
 *   name  - name of the new entry
 *   mode  - passed to xfs_dir_ialloc() and to the DMAPI events
 *   rdev  - passed to xfs_dir_ialloc()
 *   ipp   - out: must point to NULL on entry (asserted); on success it
 *           receives the new inode, carrying the reference taken by
 *           IHOLD(ip) below
 *   credp - passed to xfs_dir_ialloc() and used for the quota lookup
 *
 * When DM_EVENT_CREATE is enabled on dp that event is sent first, and a
 * DM_EVENT_POSTCREATE event (if enabled) is sent from std_return either
 * on success or when a failure follows a sent create event.
 *
 * Returns 0 on success, otherwise the error from the failing step.
 */
1785 int
1786 xfs_create(
1787         xfs_inode_t             *dp,
1788         struct xfs_name         *name,
1789         mode_t                  mode,
1790         xfs_dev_t               rdev,
1791         xfs_inode_t             **ipp,
1792         cred_t                  *credp)
1793 {
1794         xfs_mount_t             *mp = dp->i_mount;
1795         xfs_inode_t             *ip;
1796         xfs_trans_t             *tp;
1797         int                     error;
1798         xfs_bmap_free_t         free_list;
1799         xfs_fsblock_t           first_block;
1800         boolean_t               unlock_dp_on_error = B_FALSE;
1801         int                     dm_event_sent = 0;
1802         uint                    cancel_flags;
1803         int                     committed;
1804         xfs_prid_t              prid;
1805         struct xfs_dquot        *udqp, *gdqp;
1806         uint                    resblks;
1807
1808         ASSERT(!*ipp);
1809         xfs_itrace_entry(dp);
1810
1811         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1812                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1813                                 dp, DM_RIGHT_NULL, NULL,
1814                                 DM_RIGHT_NULL, name->name, NULL,
1815                                 mode, 0, 0);
1816
1817                 if (error)
1818                         return error;
1819                 dm_event_sent = 1;
1820         }
1821
1822         if (XFS_FORCED_SHUTDOWN(mp))
1823                 return XFS_ERROR(EIO);
1824
1825         /* Return through std_return after this point. */
1826
             /* Project id: inherited from dp if it asks for that, else the default. */
1827         udqp = gdqp = NULL;
1828         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1829                 prid = dp->i_d.di_projid;
1830         else
1831                 prid = (xfs_prid_t)dfltprid;
1832
1833         /*
1834          * Make sure that we have allocated dquot(s) on disk.
1835          */
1836         error = XFS_QM_DQVOPALLOC(mp, dp,
1837                         current_fsuid(credp), current_fsgid(credp), prid,
1838                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1839         if (error)
1840                 goto std_return;
1841
1842         ip = NULL;
1843
1844         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1845         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1846         resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1847         /*
1848          * Initially assume that the file does not exist and
1849          * reserve the resources for that case.  If that is not
1850          * the case we'll drop the one we have and get a more
1851          * appropriate transaction later.
1852          */
1853         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1854                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
             /* On ENOSPC, retry the reservation without any block reservation. */
1855         if (error == ENOSPC) {
1856                 resblks = 0;
1857                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1858                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1859         }
1860         if (error) {
                     /* No reservation was obtained, so cancel without release flags. */
1861                 cancel_flags = 0;
1862                 goto error_return;
1863         }
1864
1865         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1866         unlock_dp_on_error = B_TRUE;
1867
1868         XFS_BMAP_INIT(&free_list, &first_block);
1869
1870         ASSERT(ip == NULL);
1871
1872         /*
1873          * Reserve disk quota and the inode.
1874          */
1875         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1876         if (error)
1877                 goto error_return;
1878
1879         error = xfs_dir_canenter(tp, dp, name, resblks);
1880         if (error)
1881                 goto error_return;
1882         error = xfs_dir_ialloc(&tp, dp, mode, 1,
1883                         rdev, credp, prid, resblks > 0,
1884                         &ip, &committed);
1885         if (error) {
                     /* ENOSPC cancels without XFS_TRANS_ABORT; any other error aborts. */
1886                 if (error == ENOSPC)
1887                         goto error_return;
1888                 goto abort_return;
1889         }
1890         xfs_itrace_ref(ip);
1891
1892         /*
1893          * At this point, we've gotten a newly allocated inode.
1894          * It is locked (and joined to the transaction).
1895          */
1896
1897         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1898
1899         /*
1900          * Now we join the directory inode to the transaction.  We do not do it
1901          * earlier because xfs_dir_ialloc might commit the previous transaction
1902          * (and release all the locks).  An error from here on will result in
1903          * the transaction cancel unlocking dp so don't do it explicitly in the
1904          * error path.
1905          */
1906         IHOLD(dp);
1907         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1908         unlock_dp_on_error = B_FALSE;
1909
1910         error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1911                                         &first_block, &free_list, resblks ?
1912                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1913         if (error) {
1914                 ASSERT(error != ENOSPC);
1915                 goto abort_return;
1916         }
1917         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1918         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1919
1920         /*
1921          * If this is a synchronous mount, make sure that the
1922          * create transaction goes to disk before returning to
1923          * the user.
1924          */
1925         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1926                 xfs_trans_set_sync(tp);
1927         }
1928
1929         dp->i_gen++;
1930
1931         /*
1932          * Attach the dquot(s) to the inodes and modify them incore.
1933          * These ids of the inode couldn't have changed since the new
1934          * inode has been locked ever since it was created.
1935          */
1936         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
1937
1938         /*
1939          * xfs_trans_commit normally decrements the vnode ref count
1940          * when it unlocks the inode. Since we want to return the
1941          * vnode to the caller, we bump the vnode ref count now.
1942          */
1943         IHOLD(ip);
1944
1945         error = xfs_bmap_finish(&tp, &free_list, &committed);
1946         if (error) {
1947                 xfs_bmap_cancel(&free_list);
1948                 goto abort_rele;
1949         }
1950
1951         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1952         if (error) {
                     /*
                      * Drop the extra reference taken above and clear tp so
                      * that error_return does not try to cancel it.
                      */
1953                 IRELE(ip);
1954                 tp = NULL;
1955                 goto error_return;
1956         }
1957
1958         XFS_QM_DQRELE(mp, udqp);
1959         XFS_QM_DQRELE(mp, gdqp);
1960
1961         *ipp = ip;
1962
1963         /* Fallthrough to std_return with error = 0  */
1964
             /*
              * Common exit.  POSTCREATE is sent on success (*ipp set) or when
              * a failure follows a DM_EVENT_CREATE that was already sent.
              */
1965 std_return:
1966         if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
1967             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
1968                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
1969                         dp, DM_RIGHT_NULL,
1970                         *ipp ? ip : NULL,
1971                         DM_RIGHT_NULL, name->name, NULL,
1972                         mode, error, 0);
1973         }
1974         return error;
1975
             /* Same as error_return, but the transaction is cancelled with XFS_TRANS_ABORT. */
1976  abort_return:
1977         cancel_flags |= XFS_TRANS_ABORT;
1978         /* FALLTHROUGH */
1979
             /*
              * Cancel the transaction if there still is one, release the
              * dquots, and unlock dp if it was locked here but not yet joined
              * to the transaction.
              */
1980  error_return:
1981         if (tp != NULL)
1982                 xfs_trans_cancel(tp, cancel_flags);
1983
1984         XFS_QM_DQRELE(mp, udqp);
1985         XFS_QM_DQRELE(mp, gdqp);
1986
1987         if (unlock_dp_on_error)
1988                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1989
1990         goto std_return;
1991
1992  abort_rele:
1993         /*
1994          * Wait until after the current transaction is aborted to
1995          * release the inode.  This prevents recursive transactions
1996          * and deadlocks from xfs_inactive.
1997          */
1998         cancel_flags |= XFS_TRANS_ABORT;
1999         xfs_trans_cancel(tp, cancel_flags);
2000         IRELE(ip);
2001
2002         XFS_QM_DQRELE(mp, udqp);
2003         XFS_QM_DQRELE(mp, gdqp);
2004
2005         goto std_return;
2006 }
2007
2008 #ifdef DEBUG
2009 /*
2010  * Some counters to see if (and how often) we are hitting some deadlock
2011  * prevention code paths.
      * All three are updated by xfs_lock_dir_and_entry() in DEBUG builds only.
2012  */
2013
2014 int xfs_rm_locks;          /* calls to xfs_lock_dir_and_entry() */
2015 int xfs_rm_lock_delays;    /* delay(1) back-offs, one per five failed tries */
2016 int xfs_rm_attempts;       /* failed xfs_ilock_nowait() tries on the entry */
2017 #endif
2018
2019 /*
2020  * The following routine will lock the inodes associated with the
2021  * directory and the named entry in the directory. The locks are
2022  * acquired in increasing inode number.
2023  *
2024  * If the entry is "..", then only the directory is locked. The
2025  * vnode ref count will still include that from the .. entry in
2026  * this case.
2027  *
2028  * There is a deadlock we need to worry about. If the locked directory is
2029  * in the AIL, it might be blocking up the log. The next inode we lock
2030  * could be already locked by another thread waiting for log space (e.g
2031  * a permanent log reservation with a long running transaction (see
2032  * xfs_itruncate_finish)). To solve this, we must check if the directory
2033  * is in the ail and use lock_nowait. If we can't lock, we need to
2034  * drop the inode lock on the directory and try again. xfs_iunlock will
2035  * potentially push the tail if we were holding up the log.
2036  */
2037 STATIC int
2038 xfs_lock_dir_and_entry(
2039         xfs_inode_t     *dp,
2040         xfs_inode_t     *ip)    /* inode of entry 'name' */
2041 {
2042         int             attempts;
2043         xfs_ino_t       e_inum;
2044         xfs_inode_t     *ips[2];
2045         xfs_log_item_t  *lp;
2046
2047 #ifdef DEBUG
2048         xfs_rm_locks++;
2049 #endif
2050         attempts = 0;
2051
2052 again:
2053         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2054
2055         e_inum = ip->i_ino;
2056
2057         xfs_itrace_ref(ip);
2058
2059         /*
2060          * We want to lock in increasing inum. Since we've already
2061          * acquired the lock on the directory, we may need to release
2062          * if if the inum of the entry turns out to be less.
2063          */
2064         if (e_inum > dp->i_ino) {
2065                 /*
2066                  * We are already in the right order, so just
2067                  * lock on the inode of the entry.
2068                  * We need to use nowait if dp is in the AIL.
2069                  */
2070
2071                 lp = (xfs_log_item_t *)dp->i_itemp;
2072                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2073                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2074                                 attempts++;
2075 #ifdef DEBUG
2076                                 xfs_rm_attempts++;
2077 #endif
2078
2079                                 /*
2080                                  * Unlock dp and try again.
2081                                  * xfs_iunlock will try to push the tail
2082                                  * if the inode is in the AIL.
2083                                  */
2084
2085                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2086
2087                                 if ((attempts % 5) == 0) {
2088                                         delay(1); /* Don't just spin the CPU */
2089 #ifdef DEBUG
2090                                         xfs_rm_lock_delays++;
2091 #endif
2092                                 }
2093                                 goto again;
2094                         }
2095                 } else {
2096                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2097                 }
2098         } else if (e_inum < dp->i_ino) {
2099                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2100
2101                 ips[0] = ip;
2102                 ips[1] = dp;
2103                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2104         }
2105         /* else  e_inum == dp->i_ino */
2106         /*     This can happen if we're asked to lock /x/..
2107          *     the entry is "..", which is also the parent directory.
2108          */
2109
2110         return 0;
2111 }
2112
     /* DEBUG-only statistics maintained by xfs_lock_inodes(). */
2113 #ifdef DEBUG
2114 int xfs_locked_n;          /* calls that needed no retry at all */
2115 int xfs_small_retries;     /* calls that retried fewer than 5 times */
2116 int xfs_middle_retries;    /* calls that retried 5 to 99 times */
2117 int xfs_lots_retries;      /* calls that retried 100 times or more */
2118 int xfs_lock_delays;       /* delay(1) back-offs, one per five failed tries */
2119 #endif
2120
2121 /*
2122  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2123  * a different value
2124  */
2125 static inline int
2126 xfs_lock_inumorder(int lock_mode, int subclass)
2127 {
2128         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2129                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2130         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2131                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2132
2133         return lock_mode;
2134 }
2135
2136 /*
2137  * The following routine will lock n inodes in exclusive mode.
2138  * We assume the caller calls us with the inodes in i_ino order.
2139  *
2140  * We need to detect deadlock where an inode that we lock
2141  * is in the AIL and we start waiting for another inode that is locked
2142  * by a thread in a long running transaction (such as truncate). This can
2143  * result in deadlock since the long running trans might need to wait
2144  * for the inode we just locked in order to push the tail and free space
2145  * in the log.
      *
      *   ips          - array of inode pointers; an entry equal to its
      *                  predecessor is treated as already locked and skipped
      *   inodes       - number of entries in ips (asserted to be at least 2)
      *   first_locked - when nonzero, locking starts at ips[1] in try-lock
      *                  mode.  NOTE(review): presumably the caller already
      *                  holds ips[0] in that case - confirm against callers.
      *   lock_mode    - lock flags actually used; each lock is taken with
      *                  xfs_lock_inumorder(lock_mode, i), so the mode is
      *                  whatever the caller passes rather than always
      *                  exclusive.
2146  */
2147 void
2148 xfs_lock_inodes(
2149         xfs_inode_t     **ips,
2150         int             inodes,
2151         int             first_locked,
2152         uint            lock_mode)
2153 {
2154         int             attempts = 0, i, j, try_lock;
2155         xfs_log_item_t  *lp;
2156
2157         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2158
2159         if (first_locked) {
2160                 try_lock = 1;
2161                 i = 1;
2162         } else {
2163                 try_lock = 0;
2164                 i = 0;
2165         }
2166
2167 again:
2168         for (; i < inodes; i++) {
2169                 ASSERT(ips[i]);
2170
2171                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2172                         continue;
2173
2174                 /*
2175                  * If try_lock is not set yet, make sure all locked inodes
2176                  * are not in the AIL.
2177                  * If any are, set try_lock to be used later.
2178                  */
2179
2180                 if (!try_lock) {
2181                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2182                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2183                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2184                                         try_lock++;
2185                                 }
2186                         }
2187                 }
2188
2189                 /*
2190                  * If any of the previous locks we have locked is in the AIL,
2191                  * we must TRY to get the second and subsequent locks. If
2192                  * we can't get any, we must release all we have
2193                  * and try again.
2194                  */
2195
2196                 if (try_lock) {
2197                         /* try_lock must be 0 if i is 0. */
2198                         /*
2199                          * try_lock means we have an inode locked
2200                          * that is in the AIL.
2201                          */
2202                         ASSERT(i != 0);
2203                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2204                                 attempts++;
2205
2206                                 /*
2207                                  * Unlock all previous guys and try again.
2208                                  * xfs_iunlock will try to push the tail
2209                                  * if the inode is in the AIL.
2210                                  */
2211
2212                                 for(j = i - 1; j >= 0; j--) {
2213
2214                                         /*
2215                                          * Check to see if we've already
2216                                          * unlocked this one.
2217                                          * Not the first one going back,
2218                                          * and the inode ptr is the same.
2219                                          */
2220                                         if ((j != (i - 1)) && ips[j] ==
2221                                                                 ips[j+1])
2222                                                 continue;
2223
2224                                         xfs_iunlock(ips[j], lock_mode);
2225                                 }
2226
2227                                 if ((attempts % 5) == 0) {
2228                                         delay(1); /* Don't just spin the CPU */
2229 #ifdef DEBUG
2230                                         xfs_lock_delays++;
2231 #endif
2232                                 }
                                     /*
                                      * Everything down to ips[0] was just
                                      * unlocked (even if first_locked was
                                      * set), so restart from the first
                                      * inode in blocking mode.
                                      */
2233                                 i = 0;
2234                                 try_lock = 0;
2235                                 goto again;
2236                         }
2237                 } else {
2238                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2239                 }
2240         }
2241
     /* DEBUG only: classify this call by how many try-lock failures it saw. */
2242 #ifdef DEBUG
2243         if (attempts) {
2244                 if (attempts < 5) xfs_small_retries++;
2245                 else if (attempts < 100) xfs_middle_retries++;
2246                 else xfs_lots_retries++;
2247         } else {
2248                 xfs_locked_n++;
2249         }
2250 #endif
2251 }
2252
/*
 * REMOVE_DEBUG_TRACE(x)
 *
 * In DEBUG builds, record x (callers pass __LINE__) in
 * remove_which_error_return so the error path that was taken can be
 * identified afterwards.  In non-DEBUG builds it does nothing.
 *
 * Both variants expand to a single do { } while (0) statement, so
 * "REMOVE_DEBUG_TRACE(x);" is safe as the sole body of an unbraced
 * if/else in either configuration.
 */
#ifdef  DEBUG
int remove_which_error_return = 0;
#define REMOVE_DEBUG_TRACE(x)   do { remove_which_error_return = (x); } while (0)
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)   do { } while (0)
#endif  /* ! DEBUG */
2259
/*
 * xfs_remove() - remove the directory entry name, which refers to inode
 * ip, from directory dp and drop one link from ip.
 *
 * dp:   directory inode that contains the entry
 * name: name of the entry to remove
 * ip:   inode the entry refers to
 *
 * When DM_EVENT_REMOVE is enabled on dp that event is sent first, and a
 * failure from it is returned directly.  Every later exit goes through
 * std_return, which sends DM_EVENT_POSTREMOVE (when enabled on dp) with
 * the final error value.
 *
 * Returns 0 on success or an error code, for example EIO when the mount
 * has been forcibly shut down.
 */
2260 int
2261 xfs_remove(
2262         xfs_inode_t             *dp,
2263         struct xfs_name         *name,
2264         xfs_inode_t             *ip)
2265 {
2266         xfs_mount_t             *mp = dp->i_mount;
2267         xfs_trans_t             *tp = NULL;
2268         int                     error = 0;
2269         xfs_bmap_free_t         free_list;
2270         xfs_fsblock_t           first_block;
2271         int                     cancel_flags;
2272         int                     committed;
2273         int                     link_zero;
2274         uint                    resblks;
2275
2276         xfs_itrace_entry(dp);
2277
2278         if (XFS_FORCED_SHUTDOWN(mp))
2279                 return XFS_ERROR(EIO);
2280
2281         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2282                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
2283                                         NULL, DM_RIGHT_NULL, name->name, NULL,
2284                                         ip->i_d.di_mode, 0, 0);
2285                 if (error)
2286                         return error;
2287         }
2288
2289         /*
2290          * We need to get a reference to ip before we get our log
2291          * reservation. The reason for this is that we cannot call
2292          * xfs_iget for an inode for which we do not have a reference
2293          * once we've acquired a log reservation. This is because the
2294          * inode we are trying to get might be in xfs_inactive going
2295          * for a log reservation. Since we'll have to wait for the
2296          * inactive code to complete before returning from xfs_iget,
2297          * we need to make sure that we don't have log space reserved
2298          * when we call xfs_iget.  Instead we get an unlocked reference
2299          * to the inode before getting our log reservation.
2300          */
2301         IHOLD(ip);
2302
2303         xfs_itrace_entry(ip);
2304         xfs_itrace_ref(ip);
2305
2306         error = XFS_QM_DQATTACH(mp, dp, 0);
2307         if (!error && dp != ip)
2308                 error = XFS_QM_DQATTACH(mp, ip, 0);
2309         if (error) {
2310                 REMOVE_DEBUG_TRACE(__LINE__);
2311                 IRELE(ip);
2312                 goto std_return;
2313         }
2314
2315         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2316         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2317         /*
2318          * We try to get the real space reservation first,
2319          * allowing for directory btree deletion(s) implying
2320          * possible bmap insert(s).  If we can't get the space
2321          * reservation then we use 0 instead, and avoid the bmap
2322          * btree insert(s) in the directory code by, if the bmap
2323          * insert tries to happen, instead trimming the LAST
2324          * block from the directory.
2325          */
2326         resblks = XFS_REMOVE_SPACE_RES(mp);
2327         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2328                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2329         if (error == ENOSPC) {
2330                 resblks = 0;
2331                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2332                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2333         }
2334         if (error) {
2335                 ASSERT(error != ENOSPC);
2336                 REMOVE_DEBUG_TRACE(__LINE__);
2337                 xfs_trans_cancel(tp, 0);
2338                 IRELE(ip);
                     /*
                      * Leave through std_return like every other failure
                      * that follows the DM_EVENT_REMOVE event, so that
                      * DM_EVENT_POSTREMOVE is still sent with the error.
                      */
2339                 goto std_return;
2340         }
2341
2342         error = xfs_lock_dir_and_entry(dp, ip);
2343         if (error) {
2344                 REMOVE_DEBUG_TRACE(__LINE__);
2345                 xfs_trans_cancel(tp, cancel_flags);
2346                 IRELE(ip);
2347                 goto std_return;
2348         }
2349
2350         /*
2351          * At this point, we've gotten both the directory and the entry
2352          * inodes locked.
2353          */
2354         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2355         if (dp != ip) {
2356                 /*
2357                  * Increment vnode ref count only in this case since
2358                  * there's an extra vnode reference in the case where
2359                  * dp == ip.
2360                  */
2361                 IHOLD(dp);
2362                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2363         }
2364
2365         /*
2366          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2367          */
2368         XFS_BMAP_INIT(&free_list, &first_block);
2369         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2370                                         &first_block, &free_list, 0);
2371         if (error) {
2372                 ASSERT(error != ENOENT);
2373                 REMOVE_DEBUG_TRACE(__LINE__);
2374                 goto error1;
2375         }
2376         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2377
2378         dp->i_gen++;
2379         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2380
2381         error = xfs_droplink(tp, ip);
2382         if (error) {
2383                 REMOVE_DEBUG_TRACE(__LINE__);
2384                 goto error1;
2385         }
2386
2387         /* Determine if this is the last link while
2388          * we are in the transaction.
2389          */
2390         link_zero = (ip)->i_d.di_nlink==0;
2391
2392         /*
2393          * Take an extra ref on the inode so that it doesn't
2394          * go to xfs_inactive() from within the commit.
2395          */
2396         IHOLD(ip);
2397
2398         /*
2399          * If this is a synchronous mount, make sure that the
2400          * remove transaction goes to disk before returning to
2401          * the user.
2402          */
2403         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2404                 xfs_trans_set_sync(tp);
2405         }
2406
2407         error = xfs_bmap_finish(&tp, &free_list, &committed);
2408         if (error) {
2409                 REMOVE_DEBUG_TRACE(__LINE__);
2410                 goto error_rele;
2411         }
2412
2413         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2414         if (error) {
2415                 IRELE(ip);
2416                 goto std_return;
2417         }
2418
2419         /*
2420          * If we are using filestreams, kill the stream association.
2421          * If the file is still open it may get a new one but that
2422          * will get killed on last close in xfs_close() so we don't
2423          * have to worry about that.
2424          */
2425         if (link_zero && xfs_inode_is_filestream(ip))
2426                 xfs_filestream_deassociate(ip);
2427
2428         xfs_itrace_exit(ip);
2429         IRELE(ip);
2430
2431 /*      Fall through to std_return with error = 0 */
2432  std_return:
2433         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2434                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2435                                 dp, DM_RIGHT_NULL,
2436                                 NULL, DM_RIGHT_NULL,
2437                                 name->name, NULL, ip->i_d.di_mode, error, 0);
2438         }
2439         return error;
2440
             /*
              * Reached only from failures that happen before the extra
              * IHOLD(ip) taken ahead of xfs_bmap_finish, so there is no
              * additional reference to drop on this path.
              */
2441  error1:
2442         xfs_bmap_cancel(&free_list);
2443         cancel_flags |= XFS_TRANS_ABORT;
2444         xfs_trans_cancel(tp, cancel_flags);
2445         goto std_return;
2446
2447  error_rele:
2448         /*
2449          * In this case make sure to not release the inode until after
2450          * the current transaction is aborted.  Releasing it beforehand
2451          * can cause us to go to xfs_inactive and start a recursive
2452          * transaction which can easily deadlock with the current one.
2453          */
2454         xfs_bmap_cancel(&free_list);
2455         cancel_flags |= XFS_TRANS_ABORT;
2456         xfs_trans_cancel(tp, cancel_flags);
2457
2458         IRELE(ip);
2459
2460         goto std_return;
2461 }
2462
/*
 * xfs_link() - add a new directory entry target_name in directory tdp
 * that refers to the existing inode sip, and bump the link count of sip.
 *
 * tdp:         directory that receives the new entry
 * sip:         inode being linked to; asserted not to be a directory
 * target_name: name of the new entry
 *
 * When DM_EVENT_LINK is enabled on tdp that event is sent first and a
 * failure from it is returned directly.  Later exits go through
 * std_return, which sends DM_EVENT_POSTLINK with the final error value.
 *
 * Returns 0 on success or an error code: EIO when the mount has been
 * forcibly shut down, EMLINK when sip already has XFS_MAXLINK links,
 * EXDEV when tdp has XFS_DIFLAG_PROJINHERIT set and the project IDs of
 * tdp and sip differ, or whatever the quota, reservation, directory or
 * transaction calls return.
 */
2463 int
2464 xfs_link(
2465         xfs_inode_t             *tdp,
2466         xfs_inode_t             *sip,
2467         struct xfs_name         *target_name)
2468 {
2469         xfs_mount_t             *mp = tdp->i_mount;
2470         xfs_trans_t             *tp;
2471         xfs_inode_t             *ips[2];
2472         int                     error;
2473         xfs_bmap_free_t         free_list;
2474         xfs_fsblock_t           first_block;
2475         int                     cancel_flags;
2476         int                     committed;
2477         int                     resblks;
2478
2479         xfs_itrace_entry(tdp);
2480         xfs_itrace_entry(sip);
2481
2482         ASSERT(!S_ISDIR(sip->i_d.di_mode));
2483
2484         if (XFS_FORCED_SHUTDOWN(mp))
2485                 return XFS_ERROR(EIO);
2486
2487         if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2488                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2489                                         tdp, DM_RIGHT_NULL,
2490                                         sip, DM_RIGHT_NULL,
2491                                         target_name->name, NULL, 0, 0, 0);
2492                 if (error)
2493                         return error;
2494         }
2495
2496         /* Return through std_return after this point. */
2497
2498         error = XFS_QM_DQATTACH(mp, sip, 0);
2499         if (!error && sip != tdp)
2500                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2501         if (error)
2502                 goto std_return;
2503
2504         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2505         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2506         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
2507         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2508                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
             /*
              * No space for the block reservation: retry with zero blocks
              * and remember that in resblks, which is passed on to the
              * directory calls below.
              */
2509         if (error == ENOSPC) {
2510                 resblks = 0;
2511                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2512                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2513         }
2514         if (error) {
2515                 cancel_flags = 0;
2516                 goto error_return;
2517         }
2518
             /* Hand the two inodes to xfs_lock_inodes sorted by inode number. */
2519         if (sip->i_ino < tdp->i_ino) {
2520                 ips[0] = sip;
2521                 ips[1] = tdp;
2522         } else {
2523                 ips[0] = tdp;
2524                 ips[1] = sip;
2525         }
2526
2527         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2528
2529         /*
2530          * Increment vnode ref counts since xfs_trans_commit &
2531          * xfs_trans_cancel will both unlock the inodes and
2532          * decrement the associated ref counts.
2533          */
2534         IHOLD(sip);
2535         IHOLD(tdp);
2536         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2537         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2538
2539         /*
2540          * If the source has too many links, we can't make any more to it.
2541          */
2542         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2543                 error = XFS_ERROR(EMLINK);
2544                 goto error_return;
2545         }
2546
2547         /*
2548          * If we are using project inheritance, we only allow hard link
2549          * creation in our tree when the project IDs are the same; else
2550          * the tree quota mechanism could be circumvented.
2551          */
2552         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2553                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2554                 error = XFS_ERROR(EXDEV);
2555                 goto error_return;
2556         }
2557
2558         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
2559         if (error)
2560                 goto error_return;
2561
2562         XFS_BMAP_INIT(&free_list, &first_block);
2563
             /*
              * NOTE(review): this failure and the xfs_bumplink failure below
              * go to abort_return without xfs_bmap_cancel(&free_list), unlike
              * the xfs_bmap_finish failure further down - confirm nothing
              * can be left queued on free_list on these paths.
              */
2564         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
2565                                         &first_block, &free_list, resblks);
2566         if (error)
2567                 goto abort_return;
2568         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2569         tdp->i_gen++;
2570         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2571
2572         error = xfs_bumplink(tp, sip);
2573         if (error)
2574                 goto abort_return;
2575
2576         /*
2577          * If this is a synchronous mount, make sure that the
2578          * link transaction goes to disk before returning to
2579          * the user.
2580          */
2581         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2582                 xfs_trans_set_sync(tp);
2583         }
2584
2585         error = xfs_bmap_finish (&tp, &free_list, &committed);
2586         if (error) {
2587                 xfs_bmap_cancel(&free_list);
2588                 goto abort_return;
2589         }
2590
2591         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2592         if (error)
2593                 goto std_return;
2594
2595         /* Fall through to std_return with error = 0. */
2596 std_return:
             /*
              * NOTE(review): the pre-event test near the top checks tdp while
              * this one checks sip - confirm that the difference is intended.
              */
2597         if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2598                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2599                                 tdp, DM_RIGHT_NULL,
2600                                 sip, DM_RIGHT_NULL,
2601                                 target_name->name, NULL, 0, error, 0);
2602         }
2603         return error;
2604
             /* Failure after the transaction may be dirty: cancel with ABORT. */
2605  abort_return:
2606         cancel_flags |= XFS_TRANS_ABORT;
2607         /* FALLTHROUGH */
2608
2609  error_return:
2610         xfs_trans_cancel(tp, cancel_flags);
2611         goto std_return;
2612 }
2613
2614
/*
 * xfs_mkdir() - create a new directory named dir_name inside directory dp.
 *
 * dp:       parent directory inode
 * dir_name: name of the directory to create
 * mode:     mode passed to xfs_dir_ialloc and to the DMAPI events
 * ipp:      on success receives the new directory inode, on which an
 *           extra IHOLD has been taken for the caller
 * credp:    credentials used for the quota lookup and inode allocation
 *
 * When DM_EVENT_CREATE is enabled on dp that event is sent first and a
 * failure from it is returned directly.  Later exits go through
 * std_return, which sends DM_EVENT_POSTCREATE when the directory was
 * created, or when an error occurred after the create event was sent.
 *
 * Returns 0 on success or an error code: EIO when the mount has been
 * forcibly shut down, EMLINK when dp already has XFS_MAXLINK links, or
 * whatever the quota, reservation, allocation, directory or transaction
 * calls return.
 */
2615 int
2616 xfs_mkdir(
2617         xfs_inode_t             *dp,
2618         struct xfs_name         *dir_name,
2619         mode_t                  mode,
2620         xfs_inode_t             **ipp,
2621         cred_t                  *credp)
2622 {
2623         xfs_mount_t             *mp = dp->i_mount;
2624         xfs_inode_t             *cdp;   /* inode of created dir */
2625         xfs_trans_t             *tp;
2626         int                     cancel_flags;
2627         int                     error;
2628         int                     committed;
2629         xfs_bmap_free_t         free_list;
2630         xfs_fsblock_t           first_block;
2631         boolean_t               unlock_dp_on_error = B_FALSE;
2632         boolean_t               created = B_FALSE;
2633         int                     dm_event_sent = 0;
2634         xfs_prid_t              prid;
2635         struct xfs_dquot        *udqp, *gdqp;
2636         uint                    resblks;
2637
2638         if (XFS_FORCED_SHUTDOWN(mp))
2639                 return XFS_ERROR(EIO);
2640
2641         tp = NULL;
2642
2643         if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2644                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2645                                         dp, DM_RIGHT_NULL, NULL,
2646                                         DM_RIGHT_NULL, dir_name->name, NULL,
2647                                         mode, 0, 0);
2648                 if (error)
2649                         return error;
2650                 dm_event_sent = 1;
2651         }
2652
2653         /* Return through std_return after this point. */
2654
2655         xfs_itrace_entry(dp);
2656
             /*
              * NOTE(review): mp already holds dp->i_mount from its
              * initializer, so this reassignment looks redundant.
              */
2657         mp = dp->i_mount;
2658         udqp = gdqp = NULL;
             /* Use the parent project ID when it is inherited, else dfltprid. */
2659         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2660                 prid = dp->i_d.di_projid;
2661         else
2662                 prid = (xfs_prid_t)dfltprid;
2663
2664         /*
2665          * Make sure that we have allocated dquot(s) on disk.
2666          */
2667         error = XFS_QM_DQVOPALLOC(mp, dp,
2668                         current_fsuid(credp), current_fsgid(credp), prid,
2669                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2670         if (error)
2671                 goto std_return;
2672
2673         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2674         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2675         resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
2676         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2677                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
             /*
              * No space for the block reservation: retry with zero blocks
              * and remember that in resblks, which the calls below consult.
              */
2678         if (error == ENOSPC) {
2679                 resblks = 0;
2680                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2681                                           XFS_TRANS_PERM_LOG_RES,
2682                                           XFS_MKDIR_LOG_COUNT);
2683         }
2684         if (error) {
2685                 cancel_flags = 0;
2686                 goto error_return;
2687         }
2688
             /*
              * dp is locked here but not yet joined to the transaction, so
              * until the xfs_trans_ijoin below the error path has to unlock
              * it explicitly.
              */
2689         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2690         unlock_dp_on_error = B_TRUE;
2691
2692         /*
2693          * Check for directory link count overflow.
2694          */
2695         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2696                 error = XFS_ERROR(EMLINK);
2697                 goto error_return;
2698         }
2699
2700         /*
2701          * Reserve disk quota and the inode.
2702          */
2703         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2704         if (error)
2705                 goto error_return;
2706
2707         error = xfs_dir_canenter(tp, dp, dir_name, resblks);
2708         if (error)
2709                 goto error_return;
2710         /*
2711          * create the directory inode.
2712          */
2713         error = xfs_dir_ialloc(&tp, dp, mode, 2,
2714                         0, credp, prid, resblks > 0,
2715                 &cdp, NULL);
             /* ENOSPC cancels without XFS_TRANS_ABORT; other errors abort. */
2716         if (error) {
2717                 if (error == ENOSPC)
2718                         goto error_return;
2719                 goto abort_return;
2720         }
2721         xfs_itrace_ref(cdp);
2722
2723         /*
2724          * Now we add the directory inode to the transaction.
2725          * We waited until now since xfs_dir_ialloc might start
2726          * a new transaction.  Had we joined the transaction
2727          * earlier, the locks might have gotten released. An error
2728          * from here on will result in the transaction cancel
2729          * unlocking dp so don't do it explicitly in the error path.
2730          */
2731         IHOLD(dp);
2732         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2733         unlock_dp_on_error = B_FALSE;
2734
2735         XFS_BMAP_INIT(&free_list, &first_block);
2736
2737         error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2738                                         &first_block, &free_list, resblks ?
2739                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2740         if (error) {
2741                 ASSERT(error != ENOSPC);
2742                 goto error1;
2743         }
2744         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2745
2746         /*
2747          * Bump the in memory version number of the parent directory
2748          * so that other processes accessing it will recognize that
2749          * the directory has changed.
2750          */
2751         dp->i_gen++;
2752
2753         error = xfs_dir_init(tp, cdp, dp);
2754         if (error)
2755                 goto error2;
2756
2757         cdp->i_gen = 1;
2758         error = xfs_bumplink(tp, dp);
2759         if (error)
2760                 goto error2;
2761
2762         created = B_TRUE;
2763
             /* Hand the new inode to the caller with its own reference. */
2764         *ipp = cdp;
2765         IHOLD(cdp);
2766
2767         /*
2768          * Attach the dquots to the new inode and modify the icount incore.
2769          */
2770         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2771
2772         /*
2773          * If this is a synchronous mount, make sure that the
2774          * mkdir transaction goes to disk before returning to
2775          * the user.
2776          */
2777         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2778                 xfs_trans_set_sync(tp);
2779         }
2780
             /*
              * NOTE(review): on this failure and on the commit failure below
              * the reference taken for the caller is dropped, but created and
              * *ipp stay set, so std_return still passes cdp to the
              * POSTCREATE event - confirm cdp is still valid at that point.
              */
2781         error = xfs_bmap_finish(&tp, &free_list, &committed);
2782         if (error) {
2783                 IRELE(cdp);
2784                 goto error2;
2785         }
2786
2787         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2788         XFS_QM_DQRELE(mp, udqp);
2789         XFS_QM_DQRELE(mp, gdqp);
2790         if (error) {
2791                 IRELE(cdp);
2792         }
2793
2794         /* Fall through to std_return with error = 0 or errno from
2795          * xfs_trans_commit. */
2796
2797 std_return:
2798         if ((created || (error != 0 && dm_event_sent != 0)) &&
2799             DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2800                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2801                                         dp, DM_RIGHT_NULL,
2802                                         created ? cdp : NULL,
2803                                         DM_RIGHT_NULL,
2804                                         dir_name->name, NULL,
2805                                         mode, error, 0);
2806         }
2807         return error;
2808
             /*
              * Error exits, from the most to the least cleanup needed: drop
              * the bmap free list, mark the cancel as an abort, then cancel
              * the transaction and release the dquots.
              */
2809  error2:
2810  error1:
2811         xfs_bmap_cancel(&free_list);
2812  abort_return:
2813         cancel_flags |= XFS_TRANS_ABORT;
2814  error_return:
2815         xfs_trans_cancel(tp, cancel_flags);
2816         XFS_QM_DQRELE(mp, udqp);
2817         XFS_QM_DQRELE(mp, gdqp);
2818
2819         if (unlock_dp_on_error)
2820                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2821
2822         goto std_return;
2823 }
2824
/*
 * xfs_rmdir() - remove the directory entry name, which refers to the
 * directory inode cdp, from the parent directory dp.
 *
 * dp:   parent directory inode
 * name: name of the entry to remove
 * cdp:  inode of the directory being removed
 *
 * When DM_EVENT_REMOVE is enabled on dp that event is sent first and a
 * failure from it is returned directly.  Later exits go through
 * std_return, which sends DM_EVENT_POSTREMOVE (when enabled on dp) with
 * the final error value.
 *
 * On success one link is dropped from dp and two from cdp.
 *
 * Returns 0 on success or an error code: EIO when the mount has been
 * forcibly shut down, ENOTEMPTY when cdp has a link count other than 2
 * or xfs_dir_isempty() reports it is not empty, or whatever the quota,
 * reservation, locking, directory or transaction calls return.
 */
2825 int
2826 xfs_rmdir(
2827         xfs_inode_t             *dp,
2828         struct xfs_name         *name,
2829         xfs_inode_t             *cdp)
2830 {
2831         bhv_vnode_t             *dir_vp = XFS_ITOV(dp);
2832         xfs_mount_t             *mp = dp->i_mount;
2833         xfs_trans_t             *tp;
2834         int                     error;
2835         xfs_bmap_free_t         free_list;
2836         xfs_fsblock_t           first_block;
2837         int                     cancel_flags;
2838         int                     committed;
2840         uint                    resblks;
2841
2842         xfs_itrace_entry(dp);
2843
2844         if (XFS_FORCED_SHUTDOWN(mp))
2845                 return XFS_ERROR(EIO);
2846
2847         if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2848                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
2849                                         dp, DM_RIGHT_NULL,
2850                                         NULL, DM_RIGHT_NULL, name->name,
2851                                         NULL, cdp->i_d.di_mode, 0, 0);
2852                 if (error)
2853                         return XFS_ERROR(error);
2854         }
2855
2856         /*
2857          * We need to get a reference to cdp before we get our log
2858          * reservation.  The reason for this is that we cannot call
2859          * xfs_iget for an inode for which we do not have a reference
2860          * once we've acquired a log reservation.  This is because the
2861          * inode we are trying to get might be in xfs_inactive going
2862          * for a log reservation.  Since we'll have to wait for the
2863          * inactive code to complete before returning from xfs_iget,
2864          * we need to make sure that we don't have log space reserved
2865          * when we call xfs_iget.  Instead we get an unlocked reference
2866          * to the inode before getting our log reservation.
2867          */
2868         IHOLD(cdp);
2869
2870         /*
2871          * Get the dquots for the inodes.
2872          */
2873         error = XFS_QM_DQATTACH(mp, dp, 0);
2874         if (!error && dp != cdp)
2875                 error = XFS_QM_DQATTACH(mp, cdp, 0);
2876         if (error) {
2877                 IRELE(cdp);
2878                 REMOVE_DEBUG_TRACE(__LINE__);
2879                 goto std_return;
2880         }
2881
2882         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2883         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2884         /*
2885          * We try to get the real space reservation first,
2886          * allowing for directory btree deletion(s) implying
2887          * possible bmap insert(s).  If we can't get the space
2888          * reservation then we use 0 instead, and avoid the bmap
2889          * btree insert(s) in the directory code by, if the bmap
2890          * insert tries to happen, instead trimming the LAST
2891          * block from the directory.
2892          */
2893         resblks = XFS_REMOVE_SPACE_RES(mp);
2894         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2895                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
2896         if (error == ENOSPC) {
2897                 resblks = 0;
2898                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2899                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
2900         }
2901         if (error) {
2902                 ASSERT(error != ENOSPC);
2903                 cancel_flags = 0;
2904                 IRELE(cdp);
2905                 goto error_return;
2906         }
2907         XFS_BMAP_INIT(&free_list, &first_block);
2908
2909         /*
2910          * Now lock the child directory inode and the parent directory
2911          * inode in the proper order.  This will take care of validating
2912          * that the directory entry for the child directory inode has
2913          * not changed while we were obtaining a log reservation.
2914          */
2915         error = xfs_lock_dir_and_entry(dp, cdp);
2916         if (error) {
2917                 xfs_trans_cancel(tp, cancel_flags);
2918                 IRELE(cdp);
2919                 goto std_return;
2920         }
2921
2922         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2923         if (dp != cdp) {
2924                 /*
2925                  * Only increment the parent directory vnode count if
2926                  * we didn't bump it in looking up cdp.  The only time
2927                  * we don't bump it is when we're looking up ".".
2928                  */
2929                 VN_HOLD(dir_vp);
2930         }
2931
2932         xfs_itrace_ref(cdp);
2933         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
2934
             /*
              * Refuse to remove cdp unless its link count is exactly 2 and
              * xfs_dir_isempty() agrees that it holds no entries.
              */
2935         ASSERT(cdp->i_d.di_nlink >= 2);
2936         if (cdp->i_d.di_nlink != 2) {
2937                 error = XFS_ERROR(ENOTEMPTY);
2938                 goto error_return;
2939         }
2940         if (!xfs_dir_isempty(cdp)) {
2941                 error = XFS_ERROR(ENOTEMPTY);
2942                 goto error_return;
2943         }
2944
2945         error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
2946                                         &first_block, &free_list, resblks);
2947         if (error)
2948                 goto error1;
2949
2950         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2951
2952         /*
2953          * Bump the in memory generation count on the parent
2954          * directory so that other can know that it has changed.
2955          */
2956         dp->i_gen++;
2957
2958         /*
2959          * Drop the link from cdp's "..".
2960          */
2961         error = xfs_droplink(tp, dp);
2962         if (error) {
2963                 goto error1;
2964         }
2965
2966         /*
2967          * Drop the link from dp to cdp.
2968          */
2969         error = xfs_droplink(tp, cdp);
2970         if (error) {
2971                 goto error1;
2972         }
2973
2974         /*
2975          * Drop the "." link from cdp to self.
2976          */
2977         error = xfs_droplink(tp, cdp);
2978         if (error) {
2979                 goto error1;
2980         }
2981
2985         /*
2986          * Take an extra ref on the child vnode so that it
2987          * does not go to xfs_inactive() from within the commit.
2988          */
2989         IHOLD(cdp);
2990
2991         /*
2992          * If this is a synchronous mount, make sure that the
2993          * rmdir transaction goes to disk before returning to
2994          * the user.
2995          */
2996         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2997                 xfs_trans_set_sync(tp);
2998         }
2999
             /*
              * On failure here or in the commit below, the extra reference
              * on cdp is dropped only after the transaction call has
              * returned.
              */
3000         error = xfs_bmap_finish (&tp, &free_list, &committed);
3001         if (error) {
3002                 xfs_bmap_cancel(&free_list);
3003                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3004                                  XFS_TRANS_ABORT));
3005                 IRELE(cdp);
3006                 goto std_return;
3007         }
3008
3009         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3010         if (error) {
3011                 IRELE(cdp);
3012                 goto std_return;
3013         }
3014
3015
3016         IRELE(cdp);
3017
3018         /* Fall through to std_return with error = 0 or the errno
3019          * from xfs_trans_commit. */
3020  std_return:
3021         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
3022                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3023                                         dp, DM_RIGHT_NULL,
3024                                         NULL, DM_RIGHT_NULL,
3025                                         name->name, NULL, cdp->i_d.di_mode,
3026                                         error, 0);
3027         }
3028         return error;
3029
             /* Failure after the directory was modified: cancel with ABORT. */
3030  error1:
3031         xfs_bmap_cancel(&free_list);
3032         cancel_flags |= XFS_TRANS_ABORT;
3033         /* FALLTHROUGH */
3034
3035  error_return:
3036         xfs_trans_cancel(tp, cancel_flags);
3037         goto std_return;
3038 }
3039
3040 int
3041 xfs_symlink(
3042         xfs_inode_t             *dp,
3043         struct xfs_name         *link_name,
3044         const char              *target_path,
3045         mode_t                  mode,
3046         xfs_inode_t             **ipp,
3047         cred_t                  *credp)
3048 {
3049         xfs_mount_t             *mp = dp->i_mount;
3050         xfs_trans_t             *tp;
3051         xfs_inode_t             *ip;
3052         int                     error;
3053         int                     pathlen;
3054         xfs_bmap_free_t         free_list;
3055         xfs_fsblock_t           first_block;
3056         boolean_t               unlock_dp_on_error = B_FALSE;
3057         uint                    cancel_flags;
3058         int                     committed;
3059         xfs_fileoff_t           first_fsb;
3060         xfs_filblks_t           fs_blocks;
3061         int                     nmaps;
3062         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3063         xfs_daddr_t             d;
3064         const char              *cur_chunk;
3065         int                     byte_cnt;
3066         int                     n;
3067         xfs_buf_t               *bp;
3068         xfs_prid_t              prid;
3069         struct xfs_dquot        *udqp, *gdqp;
3070         uint                    resblks;
3071
3072         *ipp = NULL;
3073         error = 0;
3074         ip = NULL;
3075         tp = NULL;
3076
3077         xfs_itrace_entry(dp);
3078
3079         if (XFS_FORCED_SHUTDOWN(mp))
3080                 return XFS_ERROR(EIO);
3081
3082         /*
3083          * Check component lengths of the target path name.
3084          */
3085         pathlen = strlen(target_path);
3086         if (pathlen >= MAXPATHLEN)      /* total string too long */
3087                 return XFS_ERROR(ENAMETOOLONG);
3088
3089         if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
3090                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
3091                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3092                                         link_name->name, target_path, 0, 0, 0);
3093                 if (error)
3094                         return error;
3095         }
3096
3097         /* Return through std_return after this point. */
3098
3099         udqp = gdqp = NULL;
3100         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3101                 prid = dp->i_d.di_projid;
3102         else
3103                 prid = (xfs_prid_t)dfltprid;
3104
3105         /*
3106          * Make sure that we have allocated dquot(s) on disk.
3107          */
3108         error = XFS_QM_DQVOPALLOC(mp, dp,
3109                         current_fsuid(credp), current_fsgid(credp), prid,
3110                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3111         if (error)
3112                 goto std_return;
3113
3114         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3115         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3116         /*
3117          * The symlink will fit into the inode data fork?
3118          * There can't be any attributes so we get the whole variable part.
3119          */
3120         if (pathlen <= XFS_LITINO(mp))
3121                 fs_blocks = 0;
3122         else
3123                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3124         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
3125         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3126                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3127         if (error == ENOSPC && fs_blocks == 0) {
3128                 resblks = 0;
3129                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3130                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3131         }
3132         if (error) {
3133                 cancel_flags = 0;
3134                 goto error_return;
3135         }
3136
3137         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3138         unlock_dp_on_error = B_TRUE;
3139
3140         /*
3141          * Check whether the directory allows new symlinks or not.
3142          */
3143         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3144                 error = XFS_ERROR(EPERM);
3145                 goto error_return;
3146         }
3147
3148         /*
3149          * Reserve disk quota : blocks and inode.
3150          */
3151         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3152         if (error)
3153                 goto error_return;
3154
3155         /*
3156          * Check for ability to enter directory entry, if no space reserved.
3157          */
3158         error = xfs_dir_canenter(tp, dp, link_name, resblks);
3159         if (error)
3160                 goto error_return;
3161         /*
3162          * Initialize the bmap freelist prior to calling either
3163          * bmapi or the directory create code.
3164          */
3165         XFS_BMAP_INIT(&free_list, &first_block);
3166
3167         /*
3168          * Allocate an inode for the symlink.
3169          */
3170         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT),
3171                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3172         if (error) {
3173                 if (error == ENOSPC)
3174                         goto error_return;
3175                 goto error1;
3176         }
3177         xfs_itrace_ref(ip);
3178
3179         /*
3180          * An error after we've joined dp to the transaction will result in the
3181          * transaction cancel unlocking dp so don't do it explicitly in the
3182          * error path.
3183          */
3184         IHOLD(dp);
3185         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3186         unlock_dp_on_error = B_FALSE;
3187
3188         /*
3189          * Also attach the dquot(s) to it, if applicable.
3190          */
3191         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3192
3193         if (resblks)
3194                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3195         /*
3196          * If the symlink will fit into the inode, write it inline.
3197          */
3198         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3199                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3200                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3201                 ip->i_d.di_size = pathlen;
3202
3203                 /*
3204                  * The inode was initially created in extent format.
3205                  */
3206                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3207                 ip->i_df.if_flags |= XFS_IFINLINE;
3208
3209                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3210                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3211
3212         } else {
3213                 first_fsb = 0;
3214                 nmaps = SYMLINK_MAPS;
3215
3216                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3217                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3218                                   &first_block, resblks, mval, &nmaps,
3219                                   &free_list, NULL);
3220                 if (error) {
3221                         goto error1;
3222                 }
3223
3224                 if (resblks)
3225                         resblks -= fs_blocks;
3226                 ip->i_d.di_size = pathlen;
3227                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3228
3229                 cur_chunk = target_path;
3230                 for (n = 0; n < nmaps; n++) {
3231                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3232                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3233                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3234                                                BTOBB(byte_cnt), 0);
3235                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3236                         if (pathlen < byte_cnt) {
3237                                 byte_cnt = pathlen;
3238                         }
3239                         pathlen -= byte_cnt;
3240
3241                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3242                         cur_chunk += byte_cnt;
3243
3244                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3245                 }
3246         }
3247
3248         /*
3249          * Create the directory entry for the symlink.
3250          */
3251         error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
3252                                         &first_block, &free_list, resblks);
3253         if (error)
3254                 goto error1;
3255         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3256         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3257
3258         /*
3259          * Bump the in memory version number of the parent directory
3260          * so that other processes accessing it will recognize that
3261          * the directory has changed.
3262          */
3263         dp->i_gen++;
3264
3265         /*
3266          * If this is a synchronous mount, make sure that the
3267          * symlink transaction goes to disk before returning to
3268          * the user.
3269          */
3270         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3271                 xfs_trans_set_sync(tp);
3272         }
3273
3274         /*
3275          * xfs_trans_commit normally decrements the vnode ref count
3276          * when it unlocks the inode. Since we want to return the
3277          * vnode to the caller, we bump the vnode ref count now.
3278          */
3279         IHOLD(ip);
3280
3281         error = xfs_bmap_finish(&tp, &free_list, &committed);
3282         if (error) {
3283                 goto error2;
3284         }
3285         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3286         XFS_QM_DQRELE(mp, udqp);
3287         XFS_QM_DQRELE(mp, gdqp);
3288
3289         /* Fall through to std_return with error = 0 or errno from
3290          * xfs_trans_commit     */
3291 std_return:
3292         if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
3293                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3294                                         dp, DM_RIGHT_NULL,
3295                                         error ? NULL : ip,
3296                                         DM_RIGHT_NULL, link_name->name,
3297                                         target_path, 0, error, 0);
3298         }
3299
3300         if (!error)
3301                 *ipp = ip;
3302         return error;
3303
3304  error2:
3305         IRELE(ip);
3306  error1:
3307         xfs_bmap_cancel(&free_list);
3308         cancel_flags |= XFS_TRANS_ABORT;
3309  error_return:
3310         xfs_trans_cancel(tp, cancel_flags);
3311         XFS_QM_DQRELE(mp, udqp);
3312         XFS_QM_DQRELE(mp, gdqp);
3313
3314         if (unlock_dp_on_error)
3315                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3316
3317         goto std_return;
3318 }
3319
3320 int
3321 xfs_inode_flush(
3322         xfs_inode_t     *ip,
3323         int             flags)
3324 {
3325         xfs_mount_t     *mp = ip->i_mount;
3326         int             error = 0;
3327
3328         if (XFS_FORCED_SHUTDOWN(mp))
3329                 return XFS_ERROR(EIO);
3330
3331         /*
3332          * Bypass inodes which have already been cleaned by
3333          * the inode flush clustering code inside xfs_iflush
3334          */
3335         if (xfs_inode_clean(ip))
3336                 return 0;
3337
3338         /*
3339          * We make this non-blocking if the inode is contended,
3340          * return EAGAIN to indicate to the caller that they
3341          * did not succeed. This prevents the flush path from
3342          * blocking on inodes inside another operation right
3343          * now, they get caught later by xfs_sync.
3344          */
3345         if (flags & FLUSH_SYNC) {
3346                 xfs_ilock(ip, XFS_ILOCK_SHARED);
3347                 xfs_iflock(ip);
3348         } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3349                 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3350                         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3351                         return EAGAIN;
3352                 }
3353         } else {
3354                 return EAGAIN;
3355         }
3356
3357         error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
3358                                                     : XFS_IFLUSH_ASYNC_NOBLOCK);
3359         xfs_iunlock(ip, XFS_ILOCK_SHARED);
3360
3361         return error;
3362 }
3363
3364
3365 int
3366 xfs_set_dmattrs(
3367         xfs_inode_t     *ip,
3368         u_int           evmask,
3369         u_int16_t       state)
3370 {
3371         xfs_mount_t     *mp = ip->i_mount;
3372         xfs_trans_t     *tp;
3373         int             error;
3374
3375         if (!capable(CAP_SYS_ADMIN))
3376                 return XFS_ERROR(EPERM);
3377
3378         if (XFS_FORCED_SHUTDOWN(mp))
3379                 return XFS_ERROR(EIO);
3380
3381         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3382         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3383         if (error) {
3384                 xfs_trans_cancel(tp, 0);
3385                 return error;
3386         }
3387         xfs_ilock(ip, XFS_ILOCK_EXCL);
3388         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3389
3390         ip->i_d.di_dmevmask = evmask;
3391         ip->i_d.di_dmstate  = state;
3392
3393         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3394         IHOLD(ip);
3395         error = xfs_trans_commit(tp, 0);
3396
3397         return error;
3398 }
3399
3400 int
3401 xfs_reclaim(
3402         xfs_inode_t     *ip)
3403 {
3404         bhv_vnode_t     *vp = XFS_ITOV(ip);
3405
3406         xfs_itrace_entry(ip);
3407
3408         ASSERT(!VN_MAPPED(vp));
3409
3410         /* bad inode, get out here ASAP */
3411         if (VN_BAD(vp)) {
3412                 xfs_ireclaim(ip);
3413                 return 0;
3414         }
3415
3416         vn_iowait(ip);
3417
3418         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3419
3420         /*
3421          * Make sure the atime in the XFS inode is correct before freeing the
3422          * Linux inode.
3423          */
3424         xfs_synchronize_atime(ip);
3425
3426         /*
3427          * If we have nothing to flush with this inode then complete the
3428          * teardown now, otherwise break the link between the xfs inode and the
3429          * linux inode and clean up the xfs inode later. This avoids flushing
3430          * the inode to disk during the delete operation itself.
3431          *
3432          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3433          * first to ensure that xfs_iunpin() will never see an xfs inode
3434          * that has a linux inode being reclaimed. Synchronisation is provided
3435          * by the i_flags_lock.
3436          */
3437         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3438                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3439                 xfs_iflock(ip);
3440                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3441         } else {
3442                 xfs_mount_t     *mp = ip->i_mount;
3443
3444                 /* Protect sync and unpin from us */
3445                 XFS_MOUNT_ILOCK(mp);
3446                 spin_lock(&ip->i_flags_lock);
3447                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3448                 vn_to_inode(vp)->i_private = NULL;
3449                 ip->i_vnode = NULL;
3450                 spin_unlock(&ip->i_flags_lock);
3451                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3452                 XFS_MOUNT_IUNLOCK(mp);
3453         }
3454         return 0;
3455 }
3456
3457 int
3458 xfs_finish_reclaim(
3459         xfs_inode_t     *ip,
3460         int             locked,
3461         int             sync_mode)
3462 {
3463         xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
3464         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3465         int             error;
3466
3467         if (vp && VN_BAD(vp))
3468                 goto reclaim;
3469
3470         /* The hash lock here protects a thread in xfs_iget_core from
3471          * racing with us on linking the inode back with a vnode.
3472          * Once we have the XFS_IRECLAIM flag set it will not touch
3473          * us.
3474          */
3475         write_lock(&pag->pag_ici_lock);
3476         spin_lock(&ip->i_flags_lock);
3477         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3478             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3479                 spin_unlock(&ip->i_flags_lock);
3480                 write_unlock(&pag->pag_ici_lock);
3481                 if (locked) {
3482                         xfs_ifunlock(ip);
3483                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3484                 }
3485                 return 1;
3486         }
3487         __xfs_iflags_set(ip, XFS_IRECLAIM);
3488         spin_unlock(&ip->i_flags_lock);
3489         write_unlock(&pag->pag_ici_lock);
3490         xfs_put_perag(ip->i_mount, pag);
3491
3492         /*
3493          * If the inode is still dirty, then flush it out.  If the inode
3494          * is not in the AIL, then it will be OK to flush it delwri as
3495          * long as xfs_iflush() does not keep any references to the inode.
3496          * We leave that decision up to xfs_iflush() since it has the
3497          * knowledge of whether it's OK to simply do a delwri flush of
3498          * the inode or whether we need to wait until the inode is
3499          * pulled from the AIL.
3500          * We get the flush lock regardless, though, just to make sure
3501          * we don't free it while it is being flushed.
3502          */
3503         if (!locked) {
3504                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3505                 xfs_iflock(ip);
3506         }
3507
3508         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3509                 if (ip->i_update_core ||
3510                     ((ip->i_itemp != NULL) &&
3511                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3512                         error = xfs_iflush(ip, sync_mode);
3513                         /*
3514                          * If we hit an error, typically because of filesystem
3515                          * shutdown, we don't need to let vn_reclaim to know
3516                          * because we're gonna reclaim the inode anyway.
3517                          */
3518                         if (error) {
3519                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3520                                 goto reclaim;
3521                         }
3522                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3523                 }
3524
3525                 ASSERT(ip->i_update_core == 0);
3526                 ASSERT(ip->i_itemp == NULL ||
3527                        ip->i_itemp->ili_format.ilf_fields == 0);
3528         }
3529
3530         xfs_ifunlock(ip);
3531         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3532
3533  reclaim:
3534         xfs_ireclaim(ip);
3535         return 0;
3536 }
3537
3538 int
3539 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3540 {
3541         int             purged;
3542         xfs_inode_t     *ip, *n;
3543         int             done = 0;
3544
3545         while (!done) {
3546                 purged = 0;
3547                 XFS_MOUNT_ILOCK(mp);
3548                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3549                         if (noblock) {
3550                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3551                                         continue;
3552                                 if (xfs_ipincount(ip) ||
3553                                     !xfs_iflock_nowait(ip)) {
3554                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3555                                         continue;
3556                                 }
3557                         }
3558                         XFS_MOUNT_IUNLOCK(mp);
3559                         if (xfs_finish_reclaim(ip, noblock,
3560                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3561                                 delay(1);
3562                         purged = 1;
3563                         break;
3564                 }
3565
3566                 done = !purged;
3567         }
3568
3569         XFS_MOUNT_IUNLOCK(mp);
3570         return 0;
3571 }
3572
3573 /*
3574  * xfs_alloc_file_space()
3575  *      This routine allocates disk space for the given file.
3576  *
3577  *      If alloc_type == 0, this request is for an ALLOCSP type
3578  *      request which will change the file size.  In this case, no
3579  *      DMAPI event will be generated by the call.  A TRUNCATE event
3580  *      will be generated later by xfs_setattr.
3581  *
3582  *      If alloc_type != 0, this request is for a RESVSP type
3583  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3584  *      lower block boundary byte address is less than the file's
3585  *      length.
3586  *
3587  * RETURNS:
3588  *       0 on success
3589  *      errno on error
3590  *
3591  */
3592 STATIC int
3593 xfs_alloc_file_space(
3594         xfs_inode_t             *ip,
3595         xfs_off_t               offset,
3596         xfs_off_t               len,
3597         int                     alloc_type,
3598         int                     attr_flags)
3599 {
3600         xfs_mount_t             *mp = ip->i_mount;
3601         xfs_off_t               count;
3602         xfs_filblks_t           allocated_fsb;
3603         xfs_filblks_t           allocatesize_fsb;
3604         xfs_extlen_t            extsz, temp;
3605         xfs_fileoff_t           startoffset_fsb;
3606         xfs_fsblock_t           firstfsb;
3607         int                     nimaps;
3608         int                     bmapi_flag;
3609         int                     quota_flag;
3610         int                     rt;
3611         xfs_trans_t             *tp;
3612         xfs_bmbt_irec_t         imaps[1], *imapp;
3613         xfs_bmap_free_t         free_list;
3614         uint                    qblocks, resblks, resrtextents;
3615         int                     committed;
3616         int                     error;
3617
3618         xfs_itrace_entry(ip);
3619
3620         if (XFS_FORCED_SHUTDOWN(mp))
3621                 return XFS_ERROR(EIO);
3622
3623         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3624                 return error;
3625
3626         if (len <= 0)
3627                 return XFS_ERROR(EINVAL);
3628
3629         rt = XFS_IS_REALTIME_INODE(ip);
3630         extsz = xfs_get_extsz_hint(ip);
3631
3632         count = len;
3633         imapp = &imaps[0];
3634         nimaps = 1;
3635         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
3636         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
3637         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
3638
3639         /*      Generate a DMAPI event if needed.       */
3640         if (alloc_type != 0 && offset < ip->i_size &&
3641                         (attr_flags&ATTR_DMI) == 0  &&
3642                         DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3643                 xfs_off_t           end_dmi_offset;
3644
3645                 end_dmi_offset = offset+len;
3646                 if (end_dmi_offset > ip->i_size)
3647                         end_dmi_offset = ip->i_size;
3648                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
3649                                       end_dmi_offset - offset, 0, NULL);
3650                 if (error)
3651                         return error;
3652         }
3653
3654         /*
3655          * Allocate file space until done or until there is an error
3656          */
3657 retry:
3658         while (allocatesize_fsb && !error) {
3659                 xfs_fileoff_t   s, e;
3660
3661                 /*
3662                  * Determine space reservations for data/realtime.
3663                  */
3664                 if (unlikely(extsz)) {
3665                         s = startoffset_fsb;
3666                         do_div(s, extsz);
3667                         s *= extsz;
3668                         e = startoffset_fsb + allocatesize_fsb;
3669                         if ((temp = do_mod(startoffset_fsb, extsz)))
3670                                 e += temp;
3671                         if ((temp = do_mod(e, extsz)))
3672                                 e += extsz - temp;
3673                 } else {
3674                         s = 0;
3675                         e = allocatesize_fsb;
3676                 }
3677
3678                 if (unlikely(rt)) {
3679                         resrtextents = qblocks = (uint)(e - s);
3680                         resrtextents /= mp->m_sb.sb_rextsize;
3681                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3682                         quota_flag = XFS_QMOPT_RES_RTBLKS;
3683                 } else {
3684                         resrtextents = 0;
3685                         resblks = qblocks = \
3686                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
3687                         quota_flag = XFS_QMOPT_RES_REGBLKS;
3688                 }
3689
3690                 /*
3691                  * Allocate and setup the transaction.
3692                  */
3693                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3694                 error = xfs_trans_reserve(tp, resblks,
3695                                           XFS_WRITE_LOG_RES(mp), resrtextents,
3696                                           XFS_TRANS_PERM_LOG_RES,
3697                                           XFS_WRITE_LOG_COUNT);
3698                 /*
3699                  * Check for running out of space
3700                  */
3701                 if (error) {
3702                         /*
3703                          * Free the transaction structure.
3704                          */
3705                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
3706                         xfs_trans_cancel(tp, 0);
3707                         break;
3708                 }
3709                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3710                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
3711                                                       qblocks, 0, quota_flag);
3712                 if (error)
3713                         goto error1;
3714
3715                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3716                 xfs_trans_ihold(tp, ip);
3717
3718                 /*
3719                  * Issue the xfs_bmapi() call to allocate the blocks
3720                  */
3721                 XFS_BMAP_INIT(&free_list, &firstfsb);
3722                 error = xfs_bmapi(tp, ip, startoffset_fsb,
3723                                   allocatesize_fsb, bmapi_flag,
3724                                   &firstfsb, 0, imapp, &nimaps,
3725                                   &free_list, NULL);
3726                 if (error) {
3727                         goto error0;
3728                 }
3729
3730                 /*
3731                  * Complete the transaction
3732                  */
3733                 error = xfs_bmap_finish(&tp, &free_list, &committed);
3734                 if (error) {
3735                         goto error0;
3736                 }
3737
3738                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3739                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3740                 if (error) {
3741                         break;
3742                 }
3743
3744                 allocated_fsb = imapp->br_blockcount;
3745
3746                 if (nimaps == 0) {
3747                         error = XFS_ERROR(ENOSPC);
3748                         break;
3749                 }
3750
3751                 startoffset_fsb += allocated_fsb;
3752                 allocatesize_fsb -= allocated_fsb;
3753         }
3754 dmapi_enospc_check:
3755         if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
3756             DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
3757                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
3758                                 ip, DM_RIGHT_NULL,
3759                                 ip, DM_RIGHT_NULL,
3760                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
3761                 if (error == 0)
3762                         goto retry;     /* Maybe DMAPI app. has made space */
3763                 /* else fall through with error from XFS_SEND_DATA */
3764         }
3765
3766         return error;
3767
3768 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
3769         xfs_bmap_cancel(&free_list);
3770         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
3771
3772 error1: /* Just cancel transaction */
3773         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
3774         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3775         goto dmapi_enospc_check;
3776 }
3777
3778 /*
3779  * Zero file bytes between startoff and endoff inclusive.
3780  * The iolock is held exclusive and no blocks are buffered.
3781  */
3782 STATIC int
3783 xfs_zero_remaining_bytes(
3784         xfs_inode_t             *ip,
3785         xfs_off_t               startoff,
3786         xfs_off_t               endoff)
3787 {
3788         xfs_bmbt_irec_t         imap;
3789         xfs_fileoff_t           offset_fsb;
3790         xfs_off_t               lastoffset;
3791         xfs_off_t               offset;
3792         xfs_buf_t               *bp;
3793         xfs_mount_t             *mp = ip->i_mount;
3794         int                     nimap;
3795         int                     error = 0;
3796
3797         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3798                                 XFS_IS_REALTIME_INODE(ip) ?
3799                                 mp->m_rtdev_targp : mp->m_ddev_targp);
3800
3801         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3802                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
3803                 nimap = 1;
3804                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
3805                         NULL, 0, &imap, &nimap, NULL, NULL);
3806                 if (error || nimap < 1)
3807                         break;
3808                 ASSERT(imap.br_blockcount >= 1);
3809                 ASSERT(imap.br_startoff == offset_fsb);
3810                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
3811                 if (lastoffset > endoff)
3812                         lastoffset = endoff;
3813                 if (imap.br_startblock == HOLESTARTBLOCK)
3814                         continue;
3815                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3816                 if (imap.br_state == XFS_EXT_UNWRITTEN)
3817                         continue;
3818                 XFS_BUF_UNDONE(bp);
3819                 XFS_BUF_UNWRITE(bp);
3820                 XFS_BUF_READ(bp);
3821                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
3822                 xfsbdstrat(mp, bp);
3823                 error = xfs_iowait(bp);
3824                 if (error) {
3825                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
3826                                           mp, bp, XFS_BUF_ADDR(bp));
3827                         break;
3828                 }
3829                 memset(XFS_BUF_PTR(bp) +
3830                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
3831                       0, lastoffset - offset + 1);
3832                 XFS_BUF_UNDONE(bp);
3833                 XFS_BUF_UNREAD(bp);
3834                 XFS_BUF_WRITE(bp);
3835                 xfsbdstrat(mp, bp);
3836                 error = xfs_iowait(bp);
3837                 if (error) {
3838                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
3839                                           mp, bp, XFS_BUF_ADDR(bp));
3840                         break;
3841                 }
3842         }
3843         xfs_buf_free(bp);
3844         return error;
3845 }
3846
3847 /*
3848  * xfs_free_file_space()
3849  *      This routine frees disk space for the given file.
3850  *
3851  *      This routine is only called by xfs_change_file_space
3852  *      for an UNRESVSP type call.
3853  *
3854  * RETURNS:
3855  *       0 on success
3856  *      errno on error
3857  *
3858  */
3859 STATIC int
3860 xfs_free_file_space(
3861         xfs_inode_t             *ip,
3862         xfs_off_t               offset,
3863         xfs_off_t               len,
3864         int                     attr_flags)
3865 {
3866         bhv_vnode_t             *vp;
3867         int                     committed;
3868         int                     done;
3869         xfs_off_t               end_dmi_offset;
3870         xfs_fileoff_t           endoffset_fsb;
3871         int                     error;
3872         xfs_fsblock_t           firstfsb;
3873         xfs_bmap_free_t         free_list;
3874         xfs_bmbt_irec_t         imap;
3875         xfs_off_t               ioffset;
3876         xfs_extlen_t            mod=0;
3877         xfs_mount_t             *mp;
3878         int                     nimap;
3879         uint                    resblks;
3880         uint                    rounding;
3881         int                     rt;
3882         xfs_fileoff_t           startoffset_fsb;
3883         xfs_trans_t             *tp;
3884         int                     need_iolock = 1;
3885
3886         vp = XFS_ITOV(ip);
3887         mp = ip->i_mount;
3888
3889         xfs_itrace_entry(ip);
3890
3891         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
3892                 return error;
3893
3894         error = 0;
3895         if (len <= 0)   /* if nothing being freed */
3896                 return error;
3897         rt = XFS_IS_REALTIME_INODE(ip);
3898         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
3899         end_dmi_offset = offset + len;
3900         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
3901
3902         if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
3903             DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
3904                 if (end_dmi_offset > ip->i_size)
3905                         end_dmi_offset = ip->i_size;
3906                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
3907                                 offset, end_dmi_offset - offset,
3908                                 AT_DELAY_FLAG(attr_flags), NULL);
3909                 if (error)
3910                         return error;
3911         }
3912
3913         if (attr_flags & ATTR_NOLOCK)
3914                 need_iolock = 0;
3915         if (need_iolock) {
3916                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3917                 vn_iowait(ip);  /* wait for the completion of any pending DIOs */
3918         }
3919
3920         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
3921         ioffset = offset & ~(rounding - 1);
3922
3923         if (VN_CACHED(vp) != 0) {
3924                 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
3925                 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
3926                 if (error)
3927                         goto out_unlock_iolock;
3928         }
3929
3930         /*
3931          * Need to zero the stuff we're not freeing, on disk.
3932          * If its a realtime file & can't use unwritten extents then we
3933          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
3934          * will take care of it for us.
3935          */
3936         if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
3937                 nimap = 1;
3938                 error = xfs_bmapi(NULL, ip, startoffset_fsb,
3939                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
3940                 if (error)
3941                         goto out_unlock_iolock;
3942                 ASSERT(nimap == 0 || nimap == 1);
3943                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
3944                         xfs_daddr_t     block;
3945
3946                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3947                         block = imap.br_startblock;
3948                         mod = do_div(block, mp->m_sb.sb_rextsize);
3949                         if (mod)
3950                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
3951                 }
3952                 nimap = 1;
3953                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
3954                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
3955                 if (error)
3956                         goto out_unlock_iolock;
3957                 ASSERT(nimap == 0 || nimap == 1);
3958                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
3959                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
3960                         mod++;
3961                         if (mod && (mod != mp->m_sb.sb_rextsize))
3962                                 endoffset_fsb -= mod;
3963                 }
3964         }
3965         if ((done = (endoffset_fsb <= startoffset_fsb)))
3966                 /*
3967                  * One contiguous piece to clear
3968                  */
3969                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
3970         else {
3971                 /*
3972                  * Some full blocks, possibly two pieces to clear
3973                  */
3974                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
3975                         error = xfs_zero_remaining_bytes(ip, offset,
3976                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
3977                 if (!error &&
3978                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
3979                         error = xfs_zero_remaining_bytes(ip,
3980                                 XFS_FSB_TO_B(mp, endoffset_fsb),
3981                                 offset + len - 1);
3982         }
3983
3984         /*
3985          * free file space until done or until there is an error
3986          */
3987         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
3988         while (!error && !done) {
3989
3990                 /*
3991                  * allocate and setup the transaction. Allow this
3992                  * transaction to dip into the reserve blocks to ensure
3993                  * the freeing of the space succeeds at ENOSPC.
3994                  */
3995                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
3996                 tp->t_flags |= XFS_TRANS_RESERVE;
3997                 error = xfs_trans_reserve(tp,
3998                                           resblks,
3999                                           XFS_WRITE_LOG_RES(mp),
4000                                           0,
4001                                           XFS_TRANS_PERM_LOG_RES,
4002                                           XFS_WRITE_LOG_COUNT);
4003
4004                 /*
4005                  * check for running out of space
4006                  */
4007                 if (error) {
4008                         /*
4009                          * Free the transaction structure.
4010                          */
4011                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4012                         xfs_trans_cancel(tp, 0);
4013                         break;
4014                 }
4015                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4016                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4017                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4018                                 XFS_QMOPT_RES_REGBLKS);
4019                 if (error)
4020                         goto error1;
4021
4022                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4023                 xfs_trans_ihold(tp, ip);
4024
4025                 /*
4026                  * issue the bunmapi() call to free the blocks
4027                  */
4028                 XFS_BMAP_INIT(&free_list, &firstfsb);
4029                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
4030                                   endoffset_fsb - startoffset_fsb,
4031                                   0, 2, &firstfsb, &free_list, NULL, &done);
4032                 if (error) {
4033                         goto error0;
4034                 }
4035
4036                 /*
4037                  * complete the transaction
4038                  */
4039                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4040                 if (error) {
4041                         goto error0;
4042                 }
4043
4044                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4045                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4046         }
4047
4048  out_unlock_iolock:
4049         if (need_iolock)
4050                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4051         return error;
4052
4053  error0:
4054         xfs_bmap_cancel(&free_list);
4055  error1:
4056         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4057         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4058                     XFS_ILOCK_EXCL);
4059         return error;
4060 }
4061
4062 /*
4063  * xfs_change_file_space()
4064  *      This routine allocates or frees disk space for the given file.
4065  *      The user specified parameters are checked for alignment and size
4066  *      limitations.
4067  *
4068  * RETURNS:
4069  *       0 on success
4070  *      errno on error
4071  *
4072  */
4073 int
4074 xfs_change_file_space(
4075         xfs_inode_t     *ip,
4076         int             cmd,
4077         xfs_flock64_t   *bf,
4078         xfs_off_t       offset,
4079         cred_t          *credp,
4080         int             attr_flags)
4081 {
4082         xfs_mount_t     *mp = ip->i_mount;
4083         int             clrprealloc;
4084         int             error;
4085         xfs_fsize_t     fsize;
4086         int             setprealloc;
4087         xfs_off_t       startoffset;
4088         xfs_off_t       llen;
4089         xfs_trans_t     *tp;
4090         bhv_vattr_t     va;
4091
4092         xfs_itrace_entry(ip);
4093
4094         if (!S_ISREG(ip->i_d.di_mode))
4095                 return XFS_ERROR(EINVAL);
4096
4097         switch (bf->l_whence) {
4098         case 0: /*SEEK_SET*/
4099                 break;
4100         case 1: /*SEEK_CUR*/
4101                 bf->l_start += offset;
4102                 break;
4103         case 2: /*SEEK_END*/
4104                 bf->l_start += ip->i_size;
4105                 break;
4106         default:
4107                 return XFS_ERROR(EINVAL);
4108         }
4109
4110         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4111
4112         if (   (bf->l_start < 0)
4113             || (bf->l_start > XFS_MAXIOFFSET(mp))
4114             || (bf->l_start + llen < 0)
4115             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4116                 return XFS_ERROR(EINVAL);
4117
4118         bf->l_whence = 0;
4119
4120         startoffset = bf->l_start;
4121         fsize = ip->i_size;
4122
4123         /*
4124          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4125          * file space.
4126          * These calls do NOT zero the data space allocated to the file,
4127          * nor do they change the file size.
4128          *
4129          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4130          * space.
4131          * These calls cause the new file data to be zeroed and the file
4132          * size to be changed.
4133          */
4134         setprealloc = clrprealloc = 0;
4135
4136         switch (cmd) {
4137         case XFS_IOC_RESVSP:
4138         case XFS_IOC_RESVSP64:
4139                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4140                                                                 1, attr_flags);
4141                 if (error)
4142                         return error;
4143                 setprealloc = 1;
4144                 break;
4145
4146         case XFS_IOC_UNRESVSP:
4147         case XFS_IOC_UNRESVSP64:
4148                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4149                                                                 attr_flags)))
4150                         return error;
4151                 break;
4152
4153         case XFS_IOC_ALLOCSP:
4154         case XFS_IOC_ALLOCSP64:
4155         case XFS_IOC_FREESP:
4156         case XFS_IOC_FREESP64:
4157                 if (startoffset > fsize) {
4158                         error = xfs_alloc_file_space(ip, fsize,
4159                                         startoffset - fsize, 0, attr_flags);
4160                         if (error)
4161                                 break;
4162                 }
4163
4164                 va.va_mask = XFS_AT_SIZE;
4165                 va.va_size = startoffset;
4166
4167                 error = xfs_setattr(ip, &va, attr_flags, credp);
4168
4169                 if (error)
4170                         return error;
4171
4172                 clrprealloc = 1;
4173                 break;
4174
4175         default:
4176                 ASSERT(0);
4177                 return XFS_ERROR(EINVAL);
4178         }
4179
4180         /*
4181          * update the inode timestamp, mode, and prealloc flag bits
4182          */
4183         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4184
4185         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4186                                       0, 0, 0))) {
4187                 /* ASSERT(0); */
4188                 xfs_trans_cancel(tp, 0);
4189                 return error;
4190         }
4191
4192         xfs_ilock(ip, XFS_ILOCK_EXCL);
4193
4194         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4195         xfs_trans_ihold(tp, ip);
4196
4197         if ((attr_flags & ATTR_DMI) == 0) {
4198                 ip->i_d.di_mode &= ~S_ISUID;
4199
4200                 /*
4201                  * Note that we don't have to worry about mandatory
4202                  * file locking being disabled here because we only
4203                  * clear the S_ISGID bit if the Group execute bit is
4204                  * on, but if it was on then mandatory locking wouldn't
4205                  * have been enabled.
4206                  */
4207                 if (ip->i_d.di_mode & S_IXGRP)
4208                         ip->i_d.di_mode &= ~S_ISGID;
4209
4210                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4211         }
4212         if (setprealloc)
4213                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4214         else if (clrprealloc)
4215                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4216
4217         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4218         xfs_trans_set_sync(tp);
4219
4220         error = xfs_trans_commit(tp, 0);
4221
4222         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4223
4224         return error;
4225 }