04/21/04 16:54:59 sys/sys/bio.h 1 1 /* 65 void *bio_caller2; /* Private use by the caller. */ 2 * Copyright (c) 1982, 1986, 1989, 1993 66 TAILQ_ENTRY(bio) bio_queue; /* Disksort queue. */ 3 * The Regents of the University of California. All rights reserved. 67 const char *bio_attribute; /* Attribute for BIO_[GS]ETATTR */ 4 * (c) UNIX System Laboratories, Inc. 68 struct g_consumer *bio_from; /* GEOM linkage */ 5 * All or some portions of this file are derived from material licensed 69 struct g_provider *bio_to; /* GEOM linkage */ 6 * to the University of California by American Telephone and Telegraph 70 off_t bio_length; /* Like bio_bcount */ 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 71 off_t bio_completed; /* Inverse of bio_resid */ 8 * the permission of UNIX System Laboratories, Inc. 72 u_int bio_children; /* Number of spawned bios */ 9 * 73 u_int bio_inbed; /* Children safely home by now */ 10 * Redistribution and use in source and binary forms, with or without 74 struct bio *bio_parent; /* Pointer to parent */ 11 * modification, are permitted provided that the following conditions 75 struct bintime bio_t0; /* Time request started */ 12 * are met: 76 13 * 1. Redistributions of source code must retain the above copyright 77 bio_task_t *bio_task; /* Task_queue handler */ 14 * notice, this list of conditions and the following disclaimer. 78 void *bio_task_arg; /* Argument to above */ 15 * 2. Redistributions in binary form must reproduce the above copyright 79 16 * notice, this list of conditions and the following disclaimer in the 80 /* XXX: these go away when bio chaining is introduced */ 17 * documentation and/or other materials provided with the distribution. 81 daddr_t bio_pblkno; /* physical block number */ 18 * 3. Neither the name of the University nor the names of its contributors 82 }; 19 * may be used to endorse or promote products derived from this software 83 20 * without specific prior written permission. 84 /* bio_cmd */ 21 * 85 #define BIO_READ 0x00000001 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 86 #define BIO_WRITE 0x00000002 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 #define BIO_DELETE 0x00000004 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 #define BIO_GETATTR 0x00000008 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 89 #define BIO_CMD1 0x40000000 /* Available for local hacks */ 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 #define BIO_CMD2 0x80000000 /* Available for local hacks */ 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 /* bio_flags */ 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 #define BIO_ERROR 0x00000001 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 #define BIO_DONE 0x00000004 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 #define BIO_FLAG2 0x40000000 /* Available for local hacks */ 32 * SUCH DAMAGE. 
96 #define BIO_FLAG1 0x80000000 /* Available for local hacks */ 33 * 97 34 * @(#)buf.h 8.9 (Berkeley) 3/30/95 98 #ifdef _KERNEL 35 * $FreeBSD: src/sys/sys/bio.h,v 1.135.2.1 2004/02/11 08:31:23 scottl Exp $ 99 36 */ 100 struct uio; 37 101 struct devstat; 38 #ifndef _SYS_BIO_H_ 102 39 #define _SYS_BIO_H_ 103 struct bio_queue_head { 40 104 TAILQ_HEAD(bio_queue, bio) queue; 41 #include 105 off_t last_offset; 42 106 struct bio *insert_point; 43 struct disk; 107 struct bio *switch_point; 44 struct bio; 108 }; 45 109 46 typedef void bio_task_t(void *); 110 void biodone(struct bio *bp); 47 111 void biofinish(struct bio *bp, struct devstat *stat, int error); 48 /* 112 int biowait(struct bio *bp, const char *wchan); 49 * The bio structure describes an I/O operation in the kernel. 113 50 */ 114 void bioq_disksort(struct bio_queue_head *ap, struct bio *bp); 51 struct bio { 115 struct bio *bioq_first(struct bio_queue_head *head); 52 u_int bio_cmd; /* I/O operation. */ 116 void bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error); 53 dev_t bio_dev; /* Device to do I/O on. */ 117 void bioq_init(struct bio_queue_head *head); 54 struct disk *bio_disk; /* Valid below geom_disk.c only */ 118 void bioq_insert_tail(struct bio_queue_head *head, struct bio *bp); 55 off_t bio_offset; /* Offset into file. */ 119 void bioq_remove(struct bio_queue_head *head, struct bio *bp); 56 long bio_bcount; /* Valid bytes in buffer. */ 120 57 caddr_t bio_data; /* Memory, superblocks, indirect etc. 121 void bio_taskqueue(struct bio *bp, bio_task_t *fund, void *arg); */ 122 58 u_int bio_flags; /* BIO_ flags. */ 123 int physio(dev_t dev, struct uio *uio, int ioflag); 59 int bio_error; /* Errno for BIO_ERROR. */ 124 #define physread physio 60 long bio_resid; /* Remaining I/0 in bytes. */ 125 #define physwrite physio 61 void (*bio_done)(struct bio *); 126 62 void *bio_driver1; /* Private use by the callee. */ 127 #endif /* _KERNEL */ 63 void *bio_driver2; /* Private use by the callee. */ 128 64 void *bio_caller1; /* Private use by the caller. */ 129 #endif /* !_SYS_BIO_H_ */ 11/15/03 01:28:09 sys/sys/buf.h 1 1 /* 66 void (*io_complete)(struct buf *); 2 * Copyright (c) 1982, 1986, 1989, 1993 67 void (*io_deallocate)(struct buf *); 3 * The Regents of the University of California. All rights reserved. 68 void (*io_movedeps)(struct buf *, struct buf *); 4 * (c) UNIX System Laboratories, Inc. 69 int (*io_countdeps)(struct buf *, int); 5 * All or some portions of this file are derived from material licensed 70 } bioops; 6 * to the University of California by American Telephone and Telegraph 71 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 72 struct buf_ops { 8 * the permission of UNIX System Laboratories, Inc. 73 char *bop_name; 9 * 74 int (*bop_write)(struct buf *); 10 * Redistribution and use in source and binary forms, with or without 75 }; 11 * modification, are permitted provided that the following conditions 76 12 * are met: 77 extern struct buf_ops buf_ops_bio; 13 * 1. Redistributions of source code must retain the above copyright 78 14 * notice, this list of conditions and the following disclaimer. 79 struct vm_object; 15 * 2. Redistributions in binary form must reproduce the above copyright 80 16 * notice, this list of conditions and the following disclaimer in the 81 typedef unsigned char b_xflags_t; 17 * documentation and/or other materials provided with the distribution. 82 18 * 3. 
All advertising materials mentioning features or use of this software 83 /* 19 * must display the following acknowledgement: 84 * The buffer header describes an I/O operation in the kernel. 20 * This product includes software developed by the University of 85 * 21 * California, Berkeley and its contributors. 86 * NOTES: 22 * 4. Neither the name of the University nor the names of its contributors 87 * b_bufsize, b_bcount. b_bufsize is the allocation size of the 23 * may be used to endorse or promote products derived from this software 88 * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the 24 * without specific prior written permission. 89 * originally requested buffer size and can serve as a bounds check 25 * 90 * against EOF. For most, but not all uses, b_bcount == b_bufsize. 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 91 * 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 92 * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 93 * ranges of dirty data that need to be written to backing store. 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 94 * The range is typically clipped at b_bcount ( not b_bufsize ). 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 95 * 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 96 * b_resid. Number of bytes remaining in I/O. After an I/O operation 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 97 * completes, b_resid is usually 0 indicating 100% success. 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 98 * 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 99 * All fields are protected by the buffer lock except those marked: 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 100 * V - Protected by owning vnode lock 36 * SUCH DAMAGE. 101 * Q - Protected by the buf queue lock 37 * 102 * D - Protected by an dependency implementation specific lock 38 * @(#)buf.h 8.9 (Berkeley) 3/30/95 103 */ 39 * $FreeBSD: src/sys/sys/buf.h,v 1.162 2003/11/15 09:28:09 phk Exp $ 104 struct buf { 40 */ 105 struct bio b_io; /* "Builtin" I/O request. */ 41 106 #define b_bcount b_io.bio_bcount 42 #ifndef _SYS_BUF_H_ 107 #define b_caller1 b_io.bio_caller1 43 #define _SYS_BUF_H_ 108 #define b_data b_io.bio_data 44 109 #define b_dev b_io.bio_dev 45 #include 110 #define b_error b_io.bio_error 46 #include 111 #define b_iocmd b_io.bio_cmd 47 #include 112 #define b_ioflags b_io.bio_flags 48 113 #define b_iooffset b_io.bio_offset 49 struct bio; 114 #define b_resid b_io.bio_resid 50 struct buf; 115 struct buf_ops *b_op; 51 struct mount; 116 unsigned b_magic; 52 struct vnode; 117 #define B_MAGIC_BIO 0x10b10b10 53 118 #define B_MAGIC_NFS 0x67238234 54 /* 119 void (*b_iodone)(struct buf *); 55 * To avoid including 120 daddr_t b_blkno; /* Underlying physical block number. * 56 */ / 57 LIST_HEAD(workhead, worklist); 121 off_t b_offset; /* Offset into file. */ 58 /* 122 TAILQ_ENTRY(buf) b_vnbufs; /* (V) Buffer’s associated vnode. */ 59 * These are currently used only by the soft dependency code, hence 123 struct buf *b_left; /* (V) splay tree link */ 60 * are stored once in a global variable. 
If other subsystems wanted 124 struct buf *b_right; /* (V) splay tree link */ 61 * to use these hooks, a pointer to a set of bio_ops could be added 125 uint32_t b_vflags; /* (V) BV_* flags */ 62 * to each buffer. 126 TAILQ_ENTRY(buf) b_freelist; /* (Q) Free list position inactive. */ 63 */ 127 unsigned short b_qindex; /* (Q) buffer queue index */ 64 extern struct bio_ops { 128 uint32_t b_flags; /* B_* flags. */ 65 void (*io_start)(struct buf *); 129 b_xflags_t b_xflags; /* extra flags */ 11/15/03 01:28:09 sys/sys/buf.h 2 130 struct lock b_lock; /* Buffer lock */ 190 * 131 long b_bufsize; /* Allocated buffer size. */ 191 * B_VMIO Indicates that the buffer is tied into an VM object. 132 long b_runningbufspace; /* when I/O is running, pipelining */ 192 * The buffer’s data is always PAGE_SIZE aligned even 133 caddr_t b_kvabase; /* base kva for buffer */ 193 * if b_bufsize and b_bcount are not. ( b_bufsize is 134 int b_kvasize; /* size of kva for buffer */ 194 * always at least DEV_BSIZE aligned, though ). 135 daddr_t b_lblkno; /* Logical block number. */ 195 * 136 struct vnode *b_vp; /* Device vnode. */ 196 * B_DIRECT Hint that we should attempt to completely free 137 struct vm_object *b_object; /* Object for vp */ 197 * the pages underlying the buffer. B_DIRECT is 138 int b_dirtyoff; /* Offset in buffer of dirty region. * 198 * sticky until the buffer is released and typically / 199 * only has an effect when B_RELBUF is also set. 139 int b_dirtyend; /* Offset of end of dirty region. */ 200 * 140 struct ucred *b_rcred; /* Read credentials reference. */ 201 */ 141 struct ucred *b_wcred; /* Write credentials reference. */ 202 142 void *b_saveaddr; /* Original b_addr for physio. */ 203 #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ 143 union pager_info { 204 #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ 144 int pg_reqpage; 205 #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ 145 } b_pager; 206 #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ 146 union cluster_info { 207 #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ 147 TAILQ_HEAD(cluster_list_head, buf) cluster_head; 208 #define B_CACHE 0x00000020 /* Bread found us in the cache. */ 148 TAILQ_ENTRY(buf) cluster_entry; 209 #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ 149 } b_cluster; 210 #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ 150 struct vm_page *b_pages[btoc(MAXPHYS)]; 211 #define B_00000100 0x00000100 /* Available flag. */ 151 int b_npages; 212 #define B_DONE 0x00000200 /* I/O completed. */ 152 struct workhead b_dep; /* (D) List of filesystem dependencies 213 #define B_EINTR 0x00000400 /* I/O was interrupted */ . */ 214 #define B_00000800 0x00000800 /* Available flag. */ 153 }; 215 #define B_00001000 0x00001000 /* Available flag. */ 154 216 #define B_INVAL 0x00002000 /* Does not contain valid info. */ 155 /* 217 #define B_00004000 0x00004000 /* Available flag. */ 156 * These flags are kept in b_flags. 218 #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ 157 * 219 #define B_MALLOC 0x00010000 /* malloced b_data */ 158 * Notes: 220 #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. 159 * */ 160 * B_ASYNC VOP calls on bp’s are usually async whether or not 221 #define B_000400000 0x00040000 /* Available flag. */ 161 * B_ASYNC is set, but some subsystems, such as NFS, like 222 #define B_000800000 0x00080000 /* Available flag. */ 223 #define B_00100000 0x00100000 /* Available flag. 
*/ 162 * to know what is best for the caller so they can 224 #define B_DIRTY 0x00200000 /* Needs writing later (in EXT2FS). */ 163 * optimize the I/O. 225 #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ 164 * 226 #define B_00800000 0x00800000 /* Available flag. */ 165 * B_PAGING Indicates that bp is being used by the paging system o 227 #define B_WRITEINPROG 0x01000000 /* Write in progress. */ r 228 #define B_02000000 0x02000000 /* Available flag. */ 166 * some paging system and that the bp is not linked into 229 #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO 167 * the b_vp’s clean/dirty linked lists or ref counts. */ 168 * Buffer vp reassignments are illegal in this case. 230 #define B_08000000 0x08000000 /* Available flag. */ 169 * 231 #define B_RAM 0x10000000 /* Read ahead mark (flag) */ 170 * B_CACHE This may only be set if the buffer is entirely valid. 232 #define B_VMIO 0x20000000 /* VMIO flag */ 171 * The situation where B_DELWRI is set and B_CACHE is 233 #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it * 172 * clear MUST be committed to disk by getblk() so / 173 * B_DELWRI can also be cleared. See the comments for 234 #define B_80000000 0x80000000 /* Available flag. */ 174 * getblk() in kern/vfs_bio.c. If B_CACHE is clear, 235 175 * the caller is expected to clear BIO_ERROR and B_INVAL, 236 #define PRINT_BUF_FLAGS "\20\40b31\37cluster\36vmio\35ram\34b27" \ 176 * set BIO_READ, and initiate an I/O. 237 "\33paging\32b25\31writeinprog\30b23\27relbuf\26dirty\25b20" \ 177 * 238 "\24b19\23phys\22clusterok\21malloc\20nocache\17locked\16inval" \ 178 * The ’entire buffer’ is defined to be the range from 239 "\15scanned\14nowdrain\13eintr\12done\11b8\10delwri\7validsuspwrt" \ 179 * 0 through b_bcount. 240 "\6cache\5deferred\4direct\3async\2needcommit\1age" 180 * 241 181 * B_MALLOC Request that the buffer be allocated from the malloc 242 /* 182 * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. 243 * These flags are kept in b_xflags. 183 * 244 */ 184 * B_CLUSTEROK This flag is typically set for B_DELWRI buffers 245 #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ 185 * by filesystems that allow clustering when the buffer 246 #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ 186 * is fully dirty and indicates that it may be clustered 247 #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ 187 * with other adjacent dirty buffers. Note the clusterin 248 #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ g 249 #define BX_ALTDATA 0x00000040 /* Holds extended data */ 188 * may not be used with the stage 1 data write under NFS 250 189 * but may be used for the commit rpc portion. 251 #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ 11/15/03 01:28:09 sys/sys/buf.h 3 252 316 * it has been handed off to biodone. 253 /* 317 */ 254 * These flags are kept in b_vflags. 318 static __inline void BUF_UNLOCK(struct buf *); 255 */ 319 static __inline void 256 #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs * 320 BUF_UNLOCK(struct buf *bp) / 321 { 257 #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ 322 int s; 258 #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ 323 259 324 s = splbio(); 260 #ifdef _KERNEL 325 lockmgr(&(bp)->b_lock, LK_RELEASE, NULL, curthread); 261 /* 326 splx(s); 262 * Buffer locking 327 } 263 */ 328 264 extern const char *buf_wmesg; /* Default buffer lock message */ 329 /* 265 #define BUF_WMESG "bufwait" 330 * Free a buffer lock. 
266 #include /* XXX for curthread */ 331 */ 267 #include 332 #define BUF_LOCKFREE(bp) \ 268 333 do { \ 269 /* 334 if (BUF_REFCNT(bp) > 0) \ 270 * Initialize a lock. 335 panic("free locked buf"); \ 271 */ 336 lockdestroy(&(bp)->b_lock); \ 272 #define BUF_LOCKINIT(bp) \ 337 } while (0) 273 lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) 338 274 /* 339 #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ 275 * 340 /* 276 * Get a lock sleeping non-interruptably until it becomes available. 341 * When initiating asynchronous I/O, change ownership of the lock to the 277 */ 342 * kernel. Once done, the lock may legally released by biodone. The 278 static __inline int BUF_LOCK(struct buf *, int, struct mtx *); 343 * original owning process can no longer acquire it recursively, but must 279 static __inline int 344 * wait until the I/O is completed and the lock has been freed by biodone. 280 BUF_LOCK(struct buf *bp, int locktype, struct mtx *interlock) 345 */ 281 { 346 static __inline void BUF_KERNPROC(struct buf *); 282 int s, ret; 347 static __inline void 283 348 BUF_KERNPROC(struct buf *bp) 284 s = splbio(); 349 { 285 mtx_lock(bp->b_lock.lk_interlock); 350 struct thread *td = curthread; 286 locktype |= LK_INTERNAL; 351 287 bp->b_lock.lk_wmesg = buf_wmesg; 352 if ((td != PCPU_GET(idlethread)) 288 bp->b_lock.lk_prio = PRIBIO + 4; 353 && bp->b_lock.lk_lockholder == td) 289 ret = lockmgr(&(bp)->b_lock, locktype, interlock, curthread); 354 td->td_locks--; 290 splx(s); 355 bp->b_lock.lk_lockholder = LK_KERNPROC; 291 return ret; 356 } 292 } 357 #endif 293 /* 358 /* 294 * Get a lock sleeping with specified interruptably and timeout. 359 * Find out the number of references to a lock. 295 */ 360 */ 296 static __inline int BUF_TIMELOCK(struct buf *, int, struct mtx *, 361 static __inline int BUF_REFCNT(struct buf *); 297 char *, int, int); 362 static __inline int 298 static __inline int 363 BUF_REFCNT(struct buf *bp) 299 BUF_TIMELOCK(struct buf *bp, int locktype, struct mtx *interlock, 364 { 300 char *wmesg, int catch, int timo) 365 int s, ret; 301 { 366 302 int s, ret; 367 /* 303 368 * When the system is panicing, the lock manager grants all lock 304 s = splbio(); 369 * requests whether or not the lock is available. To avoid "unlocked 305 mtx_lock(bp->b_lock.lk_interlock); 370 * buffer" panics after a crash, we just claim that all buffers 306 locktype |= LK_INTERNAL | LK_TIMELOCK; 371 * are locked when cleaning up after a system panic. 307 bp->b_lock.lk_wmesg = wmesg; 372 */ 308 bp->b_lock.lk_prio = (PRIBIO + 4) | catch; 373 if (panicstr != NULL) 309 bp->b_lock.lk_timo = timo; 374 return (1); 310 ret = lockmgr(&(bp)->b_lock, (locktype), interlock, curthread); 375 s = splbio(); 311 splx(s); 376 ret = lockcount(&(bp)->b_lock); 312 return ret; 377 splx(s); 313 } 378 return ret; 314 /* 379 } 315 * Release a lock. Only the acquiring process may free the lock unless 380 11/15/03 01:28:09 sys/sys/buf.h 4 381 #endif /* _KERNEL */ 446 382 447 #endif /* _KERNEL */ 383 struct buf_queue_head { 448 384 TAILQ_HEAD(buf_queue, buf) queue; 449 /* 385 daddr_t last_pblkno; 450 * Zero out the buffer’s data area. 386 struct buf *insert_point; 451 */ 387 struct buf *switch_point; 452 #define clrbuf(bp) { \ 388 }; 453 bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ 389 454 (bp)->b_resid = 0; \ 390 /* 455 } 391 * This structure describes a clustered I/O. It is stored in the b_saveaddr 456 392 * field of the buffer on which I/O is done. 
At I/O completion, cluster 457 /* 393 * callback uses the structure to parcel I/O’s to individual buffers, and 458 * Flags for getblk’s last parameter. 394 * then free’s this structure. 459 */ 395 */ 460 #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ 396 struct cluster_save { 461 #define GB_NOCREAT 0x0002 /* Don’t create a buf if not found. */ 397 long bs_bcount; /* Saved b_bcount. */ 462 398 long bs_bufsize; /* Saved b_bufsize. */ 463 #ifdef _KERNEL 399 void *bs_saveaddr; /* Saved b_addr. */ 464 extern int nbuf; /* The number of buffer headers */ 400 int bs_nchildren; /* Number of associated buffers. */ 465 extern int maxswzone; /* Max KVA for swap structures */ 401 struct buf **bs_children; /* List of associated buffers. */ 466 extern int maxbcache; /* Max KVA for buffer cache */ 402 }; 467 extern int runningbufspace; 403 468 extern int buf_maxio; /* nominal maximum I/O for buffer */ 404 #ifdef _KERNEL 469 extern struct buf *buf; /* The buffer headers. */ 405 470 extern char *buffers; /* The buffer contents. */ 406 #define BUF_WRITE(bp) \ 471 extern int bufpages; /* Number of memory pages in the buffe 407 (bp)->b_op->bop_write(bp) r pool. */ 408 472 extern struct buf *swbuf; /* Swap I/O buffer headers. */ 409 static __inline void 473 extern int nswbuf; /* Number of swap I/O buffer headers. 410 buf_start(struct buf *bp) */ 411 { 474 412 if (bioops.io_start) 475 struct uio; 413 (*bioops.io_start)(bp); 476 414 } 477 caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); 415 478 void bufinit(void); 416 static __inline void 479 void bwillwrite(void); 417 buf_complete(struct buf *bp) 480 int buf_dirty_count_severe(void); 418 { 481 void bremfree(struct buf *); 419 if (bioops.io_complete) 482 int bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **); 420 (*bioops.io_complete)(bp); 483 int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int, 421 } 484 struct ucred *, struct buf **); 422 485 int bwrite(struct buf *); 423 static __inline void 486 void bdwrite(struct buf *); 424 buf_deallocate(struct buf *bp) 487 void bawrite(struct buf *); 425 { 488 void bdirty(struct buf *); 426 if (bioops.io_deallocate) 489 void bundirty(struct buf *); 427 (*bioops.io_deallocate)(bp); 490 void brelse(struct buf *); 428 BUF_LOCKFREE(bp); 491 void bqrelse(struct buf *); 429 } 492 int vfs_bio_awrite(struct buf *); 430 493 struct buf * getpbuf(int *); 431 static __inline void 494 struct buf *incore(struct vnode *, daddr_t); 432 buf_movedeps(struct buf *bp, struct buf *bp2) 495 struct buf *gbincore(struct vnode *, daddr_t); 433 { 496 int inmem(struct vnode *, daddr_t); 434 if (bioops.io_movedeps) 497 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); 435 (*bioops.io_movedeps)(bp, bp2); 498 struct buf *geteblk(int); 436 } 499 int bufwait(struct buf *); 437 500 void bufdone(struct buf *); 438 static __inline int 501 439 buf_countdeps(struct buf *bp, int i) 502 void cluster_callback(struct buf *); 440 { 503 int cluster_read(struct vnode *, u_quad_t, daddr_t, long, 441 if (bioops.io_countdeps) 504 struct ucred *, long, int, struct buf **); 442 return ((*bioops.io_countdeps)(bp, i)); 505 int cluster_wbuild(struct vnode *, long, daddr_t, int); 443 else 506 void cluster_write(struct buf *, u_quad_t, int); 444 return (0); 507 void vfs_bio_set_validclean(struct buf *, int base, int size); 445 } 508 void vfs_bio_clrbuf(struct buf *); 11/15/03 01:28:09 sys/sys/buf.h 5 509 void vfs_busy_pages(struct buf *, int clear_modify); 510 void vfs_unbusy_pages(struct buf *); 
511 void vwakeup(struct buf *); 512 int vmapbuf(struct buf *); 513 void vunmapbuf(struct buf *); 514 void relpbuf(struct buf *, int *); 515 void brelvp(struct buf *); 516 void bgetvp(struct vnode *, struct buf *); 517 void pbgetvp(struct vnode *, struct buf *); 518 void pbrelvp(struct buf *); 519 int allocbuf(struct buf *bp, int size); 520 void reassignbuf(struct buf *, struct vnode *); 521 struct buf *trypbuf(int *); 522 void bwait(struct buf *, u_char, const char *); 523 void bdone(struct buf *); 524 525 #endif /* _KERNEL */ 526 527 #endif /* !_SYS_BUF_H_ */ 06/22/03 01:41:43 sys/sys/file.h 1 1 /* 66 #ifdef _KERNEL 2 * Copyright (c) 1982, 1986, 1989, 1993 67 3 * The Regents of the University of California. All rights reserved. 68 struct file; 4 * 69 struct ucred; 5 * Redistribution and use in source and binary forms, with or without 70 6 * modification, are permitted provided that the following conditions 71 typedef int fo_rdwr_t(struct file *fp, struct uio *uio, 7 * are met: 72 struct ucred *active_cred, int flags, 8 * 1. Redistributions of source code must retain the above copyright 73 struct thread *td); 9 * notice, this list of conditions and the following disclaimer. 74 #define FOF_OFFSET 1 /* Use the offset in uio argument */ 10 * 2. Redistributions in binary form must reproduce the above copyright 75 typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, 11 * notice, this list of conditions and the following disclaimer in the 76 struct ucred *active_cred, struct thread *td); 12 * documentation and/or other materials provided with the distribution. 77 typedef int fo_poll_t(struct file *fp, int events, 13 * 3. All advertising materials mentioning features or use of this software 78 struct ucred *active_cred, struct thread *td); 14 * must display the following acknowledgement: 79 typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); 15 * This product includes software developed by the University of 80 typedef int fo_stat_t(struct file *fp, struct stat *sb, 16 * California, Berkeley and its contributors. 81 struct ucred *active_cred, struct thread *td); 17 * 4. Neither the name of the University nor the names of its contributors 82 typedef int fo_close_t(struct file *fp, struct thread *td); 18 * may be used to endorse or promote products derived from this software 83 typedef int fo_flags_t; 19 * without specific prior written permission. 84 20 * 85 struct fileops { 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 86 fo_rdwr_t *fo_read; 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 fo_rdwr_t *fo_write; 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 fo_ioctl_t *fo_ioctl; 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 89 fo_poll_t *fo_poll; 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 fo_kqfilter_t *fo_kqfilter; 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 fo_stat_t *fo_stat; 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 fo_close_t *fo_close; 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 fo_flags_t fo_flags; /* DFLAG_* below */ 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 }; 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 31 * SUCH DAMAGE. 96 #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. 
*/ 32 * 97 #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ 33 * @(#)file.h 8.3 (Berkeley) 1/9/95 98 34 * $FreeBSD: src/sys/sys/file.h,v 1.63 2003/06/22 08:41:43 phk Exp $ 99 /* 35 */ 100 * Kernel descriptor table. 36 101 * One entry for each open kernel vnode and socket. 37 #ifndef _SYS_FILE_H_ 102 * 38 #define _SYS_FILE_H_ 103 * Below is the list of locks that protects members in struct file. 39 104 * 40 #ifndef _KERNEL 105 * (fl) filelist_lock 41 #include /* XXX */ 106 * (f) f_mtx in struct file 42 #include 107 * none not locked 43 #include 108 */ 44 #else 109 45 #include 110 struct file { 46 #include 111 LIST_ENTRY(file) f_list;/* (fl) list of active files */ 47 #include 112 short f_type; /* descriptor type */ 48 113 void *f_data; /* file descriptor specific data */ 49 struct stat; 114 u_int f_flag; /* see fcntl.h */ 50 struct thread; 115 struct mtx *f_mtxp; /* mutex to protect data */ 51 struct uio; 116 struct fileops *f_ops; /* File operations */ 52 struct knote; 117 struct ucred *f_cred; /* credentials associated with descriptor */ 53 struct vnode; 118 int f_count; /* (f) reference count */ 54 struct socket; 119 struct vnode *f_vnode; /* NULL or applicable vnode */ 55 120 56 121 /* DFLAG_SEEKABLE specific fields */ 57 #endif /* _KERNEL */ 122 off_t f_offset; 58 123 59 #define DTYPE_VNODE 1 /* file */ 124 /* DTYPE_SOCKET specific fields */ 60 #define DTYPE_SOCKET 2 /* communications endpoint */ 125 short f_gcflag; /* used by thread doing fd garbage collection 61 #define DTYPE_PIPE 3 /* pipe */ */ 62 #define DTYPE_FIFO 4 /* fifo (named pipe) */ 126 #define FMARK 0x1 /* mark during gc() */ 63 #define DTYPE_KQUEUE 5 /* event queue */ 127 #define FDEFER 0x2 /* defer for next gc pass */ 64 #define DTYPE_CRYPTO 6 /* crypto */ 128 int f_msgcount; /* (f) references from message queue */ 65 129 06/22/03 01:41:43 sys/sys/file.h 2 130 /* DTYPE_VNODE specific fields */ 194 /* Lock a file. */ 131 int f_seqcount; /* 195 #define FILE_LOCK(f) mtx_lock((f)->f_mtxp) 132 * count of sequential accesses -- cleared 196 #define FILE_UNLOCK(f) mtx_unlock((f)->f_mtxp) 133 * by most seek operations. 
197 #define FILE_LOCKED(f) mtx_owned((f)->f_mtxp) 134 */ 198 #define FILE_LOCK_ASSERT(f, type) mtx_assert((f)->f_mtxp, (type)) 135 off_t f_nextoff; /* 199 136 * offset of next expected read or write 200 int fgetvp(struct thread *td, int fd, struct vnode **vpp); 137 */ 201 int fgetvp_read(struct thread *td, int fd, struct vnode **vpp); 138 }; 202 int fgetvp_write(struct thread *td, int fd, struct vnode **vpp); 139 203 140 #endif /* _KERNEL */ 204 int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp); 141 205 void fputsock(struct socket *sp); 142 /* 206 143 * Userland version of struct file, for 207 #define fhold_locked(fp) \ 144 */ 208 do { \ 145 struct xfile { 209 FILE_LOCK_ASSERT(fp, MA_OWNED); \ 146 size_t xf_size; /* size of struct xfile */ 210 (fp)->f_count++; \ 147 pid_t xf_pid; /* owning process */ 211 } while (0) 148 uid_t xf_uid; /* effective uid of owning process */ 212 149 int xf_fd; /* descriptor number */ 213 #define fhold(fp) \ 150 void *xf_file; /* address of struct file */ 214 do { \ 151 short xf_type; /* descriptor type */ 215 FILE_LOCK(fp); \ 152 int xf_count; /* reference count */ 216 fhold_locked(fp); \ 153 int xf_msgcount; /* references from message queue */ 217 FILE_UNLOCK(fp); \ 154 off_t xf_offset; /* file offset */ 218 } while (0) 155 void *xf_data; /* file descriptor specific data */ 219 156 u_int xf_flag; /* flags (see fcntl.h) */ 220 static __inline fo_rdwr_t fo_read; 157 }; 221 static __inline fo_rdwr_t fo_write; 158 222 static __inline fo_ioctl_t fo_ioctl; 159 #ifdef _KERNEL 223 static __inline fo_poll_t fo_poll; 160 224 static __inline fo_kqfilter_t fo_kqfilter; 161 #ifdef MALLOC_DECLARE 225 static __inline fo_stat_t fo_stat; 162 MALLOC_DECLARE(M_FILE); 226 static __inline fo_close_t fo_close; 163 #endif 227 164 228 static __inline int 165 LIST_HEAD(filelist, file); 229 fo_read(fp, uio, active_cred, flags, td) 166 extern struct filelist filehead; /* (fl) head of list of open files */ 230 struct file *fp; 167 extern struct fileops vnops; 231 struct uio *uio; 168 extern struct fileops badfileops; 232 struct ucred *active_cred; 169 extern struct fileops socketops; 233 int flags; 170 extern int maxfiles; /* kernel limit on number of open files */ 234 struct thread *td; 171 extern int maxfilesperproc; /* per process limit on number of open files * 235 { / 236 172 extern int nfiles; /* (fl) actual number of open files */ 237 return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); 173 extern struct sx filelist_lock; /* sx to protect filelist and nfiles */ 238 } 174 239 175 int fget(struct thread *td, int fd, struct file **fpp); 240 static __inline int 176 int fget_read(struct thread *td, int fd, struct file **fpp); 241 fo_write(fp, uio, active_cred, flags, td) 177 int fget_write(struct thread *td, int fd, struct file **fpp); 242 struct file *fp; 178 int fdrop(struct file *fp, struct thread *td); 243 struct uio *uio; 179 int fdrop_locked(struct file *fp, struct thread *td); 244 struct ucred *active_cred; 180 245 int flags; 181 /* 246 struct thread *td; 182 * The socket operations are used a couple of places. 247 { 183 * XXX: This is wrong, they should go through the operations vector for 248 184 * XXX: sockets instead of going directly for the individual functions. 
/phk 249 return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); 185 */ 250 } 186 fo_rdwr_t soo_read; 251 187 fo_rdwr_t soo_write; 252 static __inline int 188 fo_ioctl_t soo_ioctl; 253 fo_ioctl(fp, com, data, active_cred, td) 189 fo_poll_t soo_poll; 254 struct file *fp; 190 fo_kqfilter_t soo_kqfilter; 255 u_long com; 191 fo_stat_t soo_stat; 256 void *data; 192 fo_close_t soo_close; 257 struct ucred *active_cred; 193 258 struct thread *td; 06/22/03 01:41:43 sys/sys/file.h 3 259 { 260 261 return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); 262 } 263 264 static __inline int 265 fo_poll(fp, events, active_cred, td) 266 struct file *fp; 267 int events; 268 struct ucred *active_cred; 269 struct thread *td; 270 { 271 272 return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); 273 } 274 275 static __inline int 276 fo_stat(fp, sb, active_cred, td) 277 struct file *fp; 278 struct stat *sb; 279 struct ucred *active_cred; 280 struct thread *td; 281 { 282 283 return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td)); 284 } 285 286 static __inline int 287 fo_close(fp, td) 288 struct file *fp; 289 struct thread *td; 290 { 291 292 return ((*fp->f_ops->fo_close)(fp, td)); 293 } 294 295 static __inline int 296 fo_kqfilter(fp, kn) 297 struct file *fp; 298 struct knote *kn; 299 { 300 301 return ((*fp->f_ops->fo_kqfilter)(fp, kn)); 302 } 303 304 #endif /* _KERNEL */ 305 306 #endif /* !SYS_FILE_H */ 11/12/03 00:01:40 sys/sys/mount.h 1 1 /* */ 2 * Copyright (c) 1989, 1991, 1993 66 #define MNAMELEN 88 /* size of on/from name bufs */ 3 * The Regents of the University of California. All rights reserved. 67 #define STATFS_VERSION 0x20030518 /* current version number */ 4 * 68 struct statfs { 5 * Redistribution and use in source and binary forms, with or without 69 uint32_t f_version; /* structure version number */ 6 * modification, are permitted provided that the following conditions 70 uint32_t f_type; /* type of filesystem */ 7 * are met: 71 uint64_t f_flags; /* copy of mount exported flags */ 8 * 1. Redistributions of source code must retain the above copyright 72 uint64_t f_bsize; /* filesystem fragment size */ 9 * notice, this list of conditions and the following disclaimer. 73 uint64_t f_iosize; /* optimal transfer block size */ 10 * 2. Redistributions in binary form must reproduce the above copyright 74 uint64_t f_blocks; /* total data blocks in filesystem */ 11 * notice, this list of conditions and the following disclaimer in the 75 uint64_t f_bfree; /* free blocks in filesystem */ 12 * documentation and/or other materials provided with the distribution. 76 int64_t f_bavail; /* free blocks avail to non-superuser 13 * 3. All advertising materials mentioning features or use of this software */ 14 * must display the following acknowledgement: 77 uint64_t f_files; /* total file nodes in filesystem */ 15 * This product includes software developed by the University of 78 int64_t f_ffree; /* free nodes avail to non-superuser * 16 * California, Berkeley and its contributors. / 17 * 4. Neither the name of the University nor the names of its contributors 79 uint64_t f_syncwrites; /* count of sync writes since mount */ 18 * may be used to endorse or promote products derived from this software 80 uint64_t f_asyncwrites; /* count of async writes since mount * 19 * without specific prior written permission. 
/ 20 * 81 uint64_t f_syncreads; /* count of sync reads since mount */ 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 82 uint64_t f_asyncreads; /* count of async reads since mount */ 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 83 uint64_t f_spare[10]; /* unused spare */ 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 84 uint32_t f_namemax; /* maximum filename length */ 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 85 uid_t f_owner; /* user that mounted the filesystem */ 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 86 fsid_t f_fsid; /* filesystem id */ 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 87 char f_charspare[80]; /* spare string space */ 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 88 char f_fstypename[MFSNAMELEN]; /* filesystem type name */ 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 89 char f_mntfromname[MNAMELEN]; /* mounted filesystem */ 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 90 char f_mntonname[MNAMELEN]; /* directory on which mounted */ 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 91 }; 31 * SUCH DAMAGE. 92 32 * 93 #ifdef _KERNEL 33 * @(#)mount.h 8.21 (Berkeley) 5/20/95 94 #define OMFSNAMELEN 16 /* length of fs type name, including null */ 34 * $FreeBSD: src/sys/sys/mount.h,v 1.152 2003/11/12 08:01:40 mckusick Exp $ 95 #define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs * 35 */ / 36 96 37 #ifndef _SYS_MOUNT_H_ 97 /* XXX getfsstat.2 is out of date with write and read counter changes here. */ 38 #define _SYS_MOUNT_H_ 98 /* XXX statfs.2 is out of date with read counter changes here. */ 39 99 struct ostatfs { 40 #include 100 long f_spare2; /* placeholder */ 41 #include 101 long f_bsize; /* fundamental filesystem block size * 42 #ifdef _KERNEL / 43 #include 102 long f_iosize; /* optimal transfer block size */ 44 #include 103 long f_blocks; /* total data blocks in filesystem */ 45 #include 104 long f_bfree; /* free blocks in fs */ 46 #endif 105 long f_bavail; /* free blocks avail to non-superuser 47 */ 48 typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ 106 long f_files; /* total file nodes in filesystem */ 49 107 long f_ffree; /* free file nodes in fs */ 50 /* 108 fsid_t f_fsid; /* filesystem id */ 51 * File identifier. 109 uid_t f_owner; /* user that mounted the filesystem */ 52 * These are unique per filesystem on a single machine. 
110 int f_type; /* type of filesystem */ 53 */ 111 int f_flags; /* copy of mount exported flags */ 54 #define MAXFIDSZ 16 112 long f_syncwrites; /* count of sync writes since mount */ 55 113 long f_asyncwrites; /* count of async writes since mount * 56 struct fid { / 57 u_short fid_len; /* length of data in bytes */ 114 char f_fstypename[OMFSNAMELEN]; /* fs type name */ 58 u_short fid_reserved; /* force longword alignment */ 115 char f_mntonname[OMNAMELEN]; /* directory on which mounted */ 59 char fid_data[MAXFIDSZ]; /* data (variable length) */ 116 long f_syncreads; /* count of sync reads since mount */ 60 }; 117 long f_asyncreads; /* count of async reads since mount */ 61 118 short f_spares1; /* unused spare */ 62 /* 119 char f_mntfromname[OMNAMELEN];/* mounted filesystem */ 63 * filesystem statistics 120 short f_spares2; /* unused spare */ 64 */ 121 /* 65 #define MFSNAMELEN 16 /* length of type name including null 122 * XXX on machines where longs are aligned to 8-byte boundaries, there 11/12/03 00:01:40 sys/sys/mount.h 2 123 * is an unnamed int32_t here. This spare was after the apparent end 180 #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) 124 * of the struct until we bit off the read counters from f_mntonname. 181 #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) 125 */ 182 126 long f_spare[2]; /* unused spare */ 183 #endif /* _KERNEL */ 127 }; 184 128 185 /* 129 #define MMAXOPTIONLEN 65536 /* maximum length of a mount option */ 186 * User specifiable flags. 130 187 */ 131 TAILQ_HEAD(vnodelst, vnode); 188 #define MNT_RDONLY 0x00000001 /* read only filesystem */ 132 TAILQ_HEAD(vfsoptlist, vfsopt); 189 #define MNT_SYNCHRONOUS 0x00000002 /* filesystem written synchronously */ 133 struct vfsopt { 190 #define MNT_NOEXEC 0x00000004 /* can’t exec from filesystem */ 134 TAILQ_ENTRY(vfsopt) link; 191 #define MNT_NOSUID 0x00000008 /* don’t honor setuid bits on fs */ 135 char *name; 192 #define MNT_NODEV 0x00000010 /* don’t interpret special files */ 136 void *value; 193 #define MNT_UNION 0x00000020 /* union with underlying filesystem */ 137 int len; 194 #define MNT_ASYNC 0x00000040 /* filesystem written asynchronously * 138 }; / 139 195 #define MNT_SUIDDIR 0x00100000 /* special handling of SUID on dirs */ 140 /* 196 #define MNT_SOFTDEP 0x00200000 /* being done */ 141 * Structure per mounted filesystem. Each mounted filesystem has an 197 #define MNT_NOSYMFOLLOW 0x00400000 /* do not follow symlinks */ 142 * array of operations and an instance record. The filesystems are 198 #define MNT_JAILDEVFS 0x02000000 /* jail-friendly DEVFS behaviour */ 143 * put on a doubly linked list. 199 #define MNT_MULTILABEL 0x04000000 /* MAC support for individual objects 144 * */ 145 * NOTE: mnt_nvnodelist and mnt_reservedvnlist. At the moment vnodes 200 #define MNT_ACLS 0x08000000 /* ACL support enabled */ 146 * are linked into mnt_nvnodelist. At some point in the near future the 201 #define MNT_NOATIME 0x10000000 /* disable update of file access time 147 * vnode list will be split into a ’dirty’ and ’clean’ list. mnt_nvnodelist */ 148 * will become the dirty list and mnt_reservedvnlist will become the ’clean’ 202 #define MNT_NOCLUSTERR 0x40000000 /* disable cluster read */ 149 * list. Filesystem kld’s syncing code should remain compatible since 203 #define MNT_NOCLUSTERW 0x80000000 /* disable cluster write */ 150 * they only need to scan the dirty vnode list (nvnodelist -> dirtyvnodelist). 204 151 */ 205 /* 152 struct mount { 206 * NFS export related mount flags. 
153 TAILQ_ENTRY(mount) mnt_list; /* mount list */ 207 */ 154 struct vfsops *mnt_op; /* operations on fs */ 208 #define MNT_EXRDONLY 0x00000080 /* exported read only */ 155 struct vfsconf *mnt_vfc; /* configuration info */ 209 #define MNT_EXPORTED 0x00000100 /* filesystem is exported */ 156 struct vnode *mnt_vnodecovered; /* vnode we mounted on */ 210 #define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ 157 struct vnode *mnt_syncer; /* syncer vnode */ 211 #define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone * 158 struct vnodelst mnt_nvnodelist; /* list of vnodes this mount * / / 212 #define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping 159 struct vnodelst mnt_reservedvnlist; /* (future) dirty vnode list * */ / 213 #define MNT_EXPUBLIC 0x20000000 /* public export (WebNFS) */ 160 struct lock mnt_lock; /* mount structure lock */ 214 161 struct mtx mnt_mtx; /* mount structure interlock * 215 /* / 216 * Flags set by internal operations, 162 int mnt_writeopcount; /* write syscalls in progress 217 * but visible to the user. */ 218 * XXX some of these are not quite right.. (I’ve never seen the root flag set) 163 int mnt_flag; /* flags shared with user */ 219 */ 164 struct vfsoptlist *mnt_opt; /* current mount options */ 220 #define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ 165 struct vfsoptlist *mnt_optnew; /* new options passed to fs */ 221 #define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ 166 int mnt_kern_flag; /* kernel only flags */ 222 #define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ 167 int mnt_maxsymlinklen; /* max size of short symlink * 223 #define MNT_USER 0x00008000 /* mounted by a user */ / 224 #define MNT_IGNORE 0x00800000 /* do not show entry in df */ 168 struct statfs mnt_stat; /* cache of filesystem stats * 225 / 226 /* 169 struct ucred *mnt_cred; /* credentials of mounter */ 227 * Mask of flags that are visible to statfs(). 170 qaddr_t mnt_data; /* private data */ 228 * XXX I think that this could now become (˜(MNT_CMDFLAGS)) 171 time_t mnt_time; /* last time written*/ 229 * but the ’mount’ program may need changing to handle this. 172 int mnt_iosize_max; /* max size for clusters, etc 230 */ */ 231 #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ 173 struct netexport *mnt_export; /* export list */ 232 MNT_NOSUID | MNT_NODEV | MNT_UNION | \ 174 struct label *mnt_mntlabel; /* MAC label for the mount */ 233 MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ 175 struct label *mnt_fslabel; /* MAC label for the fs */ 234 MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ 176 int mnt_nvnodelistsize; /* # of vnodes on this mount * 235 MNT_LOCAL | MNT_USER | MNT_QUOTA | \ / 236 MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ 177 }; 237 MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ 178 238 MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ 179 239 MNT_JAILDEVFS | MNT_MULTILABEL | MNT_ACLS) 11/12/03 00:01:40 sys/sys/mount.h 3 240 305 #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ 241 /* Mask of flags that can be updated. 
*/ 306 #define MNT_LAZY 3 /* push data not written by filesystem syncer 242 #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | \ */ 243 MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ 307 244 MNT_NOATIME | \ 308 /* 245 MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_JAILDEVFS | \ 309 * Generic file handle 246 MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ 310 */ 247 MNT_ACLS ) 311 struct fhandle { 248 312 fsid_t fh_fsid; /* Filesystem id of mount point */ 249 /* 313 struct fid fh_fid; /* Filesys specific id */ 250 * External filesystem command modifier flags. 314 }; 251 * Unmount can use the MNT_FORCE flag. 315 typedef struct fhandle fhandle_t; 252 * XXX These are not STATES and really should be somewhere else. 316 253 */ 317 /* 254 #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ 318 * Export arguments for local filesystem mount calls. 255 #define MNT_DELEXPORT 0x00020000 /* delete export host lists */ 319 */ 256 #define MNT_RELOAD 0x00040000 /* reload filesystem data */ 320 struct export_args { 257 #define MNT_FORCE 0x00080000 /* force unmount or readonly change */ 321 int ex_flags; /* export related flags */ 258 #define MNT_SNAPSHOT 0x01000000 /* snapshot the filesystem */ 322 uid_t ex_root; /* mapping for root uid */ 259 #define MNT_BYFSID 0x08000000 /* specify filesystem by ID. */ 323 struct xucred ex_anon; /* mapping for anonymous user */ 260 #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ 324 struct sockaddr *ex_addr; /* net address to which exported */ 261 MNT_FORCE | MNT_SNAPSHOT | MNT_BYFSID) 325 u_char ex_addrlen; /* and the net address length */ 262 /* 326 struct sockaddr *ex_mask; /* mask of valid bits in saddr */ 263 * Internal filesystem control flags stored in mnt_kern_flag. 327 u_char ex_masklen; /* and the smask length */ 264 * 328 char *ex_indexfile; /* index file for WebNFS URLs */ 265 * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed 329 }; 266 * past the mount point. This keeps the subtree stable during mounts 330 267 * and unmounts. 331 /* 268 * 332 * Structure holding information for a publicly exported filesystem 269 * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while 333 * (WebNFS). Currently the specs allow just for one such filesystem. 270 * dounmount() is still waiting to lock the mountpoint. This allows 334 */ 271 * the filesystem to cancel operations that might otherwise deadlock 335 struct nfs_public { 272 * with the unmount attempt (used by NFS). 336 int np_valid; /* Do we hold valid information */ 273 */ 337 fhandle_t np_handle; /* Filehandle for pub fs (internal) */ 274 #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ 338 struct mount *np_mount; /* Mountpoint of exported fs */ 275 #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ 339 char *np_index; /* Index file */ 276 #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ 340 }; 277 #define MNTK_WANTRDWR 0x04000000 /* upgrade to read/write requested */ 341 278 #define MNTK_SUSPEND 0x08000000 /* request write suspension */ 342 /* 279 #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ 343 * Filesystem configuration information. One of these exists for each 280 344 * type of filesystem supported by the kernel. These are searched at 281 /* 345 * mount time to identify the requested filesystem. 282 * Sysctl CTL_VFS definitions. 346 */ 283 * 347 struct vfsconf { 284 * Second level identifier specifies which filesystem. 
Second level 348 struct vfsops *vfc_vfsops; /* filesystem operations vector */ 285 * identifier VFS_VFSCONF returns information about all filesystems. 349 char vfc_name[MFSNAMELEN]; /* filesystem type name */ 286 * Second level identifier VFS_GENERIC is non-terminal. 350 int vfc_typenum; /* historic filesystem type number */ 287 */ 351 int vfc_refcount; /* number mounted of this type */ 288 #define VFS_VFSCONF 0 /* get configured filesystems */ 352 int vfc_flags; /* permanent flags */ 289 #define VFS_GENERIC 0 /* generic filesystem information */ 353 struct vfsoptdecl *vfc_opts; /* mount options */ 290 /* 354 struct vfsconf *vfc_next; /* next in list */ 291 * Third level identifiers for VFS_GENERIC are given below; third 355 }; 292 * level identifiers for specific filesystems are given in their 356 293 * mount specific header files. 357 /* Userland version of the struct vfsconf. */ 294 */ 358 struct xvfsconf { 295 #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ 359 struct vfsops *vfc_vfsops; /* filesystem operations vector */ 296 #define VFS_CONF 2 /* struct: vfsconf for filesystem given 360 char vfc_name[MFSNAMELEN]; /* filesystem type name */ 297 as next argument */ 361 int vfc_typenum; /* historic filesystem type number */ 298 362 int vfc_refcount; /* number mounted of this type */ 299 /* 363 int vfc_flags; /* permanent flags */ 300 * Flags for various system call interfaces. 364 struct vfsconf *vfc_next; /* next in list */ 301 * 365 }; 302 * waitfor flags to vfs_sync() and getfsstat() 366 303 */ 367 struct ovfsconf { 304 #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ 368 void *vfc_vfsops; 11/12/03 00:01:40 sys/sys/mount.h 4 369 char vfc_name[32]; 432 vfs_unmount_t *vfs_unmount; 370 int vfc_index; 433 vfs_root_t *vfs_root; 371 int vfc_refcount; 434 vfs_quotactl_t *vfs_quotactl; 372 int vfc_flags; 435 vfs_statfs_t *vfs_statfs; 373 }; 436 vfs_sync_t *vfs_sync; 374 437 vfs_vget_t *vfs_vget; 375 /* 438 vfs_fhtovp_t *vfs_fhtovp; 376 * NB: these flags refer to IMPLEMENTATION properties, not properties of 439 vfs_checkexp_t *vfs_checkexp; 377 * any actual mounts; i.e., it does not make sense to change the flags. 440 vfs_vptofh_t *vfs_vptofh; 378 */ 441 vfs_init_t *vfs_init; 379 #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ 442 vfs_uninit_t *vfs_uninit; 380 #define VFCF_NETWORK 0x00020000 /* may get data over the network */ 443 vfs_extattrctl_t *vfs_extattrctl; 381 #define VFCF_READONLY 0x00040000 /* writes are not implemented */ 444 /* Additions below are not binary compatible with 5.0 and below. 
*/ 382 #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files 445 vfs_nmount_t *vfs_nmount; */ 446 }; 383 #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ 447 384 #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode*/ 448 #define VFS_NMOUNT(MP, NDP, P) (*(MP)->mnt_op->vfs_nmount)(MP, NDP, P) 385 449 #define VFS_MOUNT(MP, PATH, DATA, NDP, P) \ 386 struct iovec; 450 (*(MP)->mnt_op->vfs_mount)(MP, PATH, DATA, NDP, P) 387 struct uio; 451 #define VFS_START(MP, FLAGS, P) (*(MP)->mnt_op->vfs_start)(MP, FLAGS, P) 388 452 #define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P) 389 #ifdef _KERNEL 453 #define VFS_ROOT(MP, VPP) (*(MP)->mnt_op->vfs_root)(MP, VPP) 390 454 #define VFS_QUOTACTL(MP,C,U,A,P) (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P 391 #ifdef MALLOC_DECLARE ) 392 MALLOC_DECLARE(M_MOUNT); 455 #define VFS_STATFS(MP, SBP, P) (*(MP)->mnt_op->vfs_statfs)(MP, SBP, P) 393 #endif 456 #define VFS_SYNC(MP, WAIT, C, P) (*(MP)->mnt_op->vfs_sync)(MP, WAIT, C, P) 394 extern int maxvfsconf; /* highest defined filesystem type */ 457 #define VFS_VGET(MP, INO, FLAGS, VPP) \ 395 extern int nfs_mount_type; /* vfc_typenum for nfs, or -1 */ 458 (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP) 396 extern struct vfsconf *vfsconf; /* head of list of filesystem types */ 459 #define VFS_FHTOVP(MP, FIDP, VPP) \ 397 460 (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, VPP) 398 /* 461 #define VFS_VPTOFH(VP, FIDP) (*(VP)->v_mount->mnt_op->vfs_vptofh)(VP, FID 399 * Operations supported on mounted filesystem. P) 400 */ 462 #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED) \ 401 struct mount_args; 463 (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED) 402 struct nameidata; 464 #define VFS_EXTATTRCTL(MP, C, FN, NS, N, P) \ 403 465 (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N, P) 404 typedef int vfs_mount_t(struct mount *mp, char *path, caddr_t data, 466 405 struct nameidata *ndp, struct thread *td); 467 #include 406 typedef int vfs_start_t(struct mount *mp, int flags, struct thread *td); 468 407 typedef int vfs_unmount_t(struct mount *mp, int mntflags, struct thread *td); 469 #define VFS_SET(vfsops, fsname, flags) \ 408 typedef int vfs_root_t(struct mount *mp, struct vnode **vpp); 470 static struct vfsconf fsname ## _vfsconf = { \ 409 typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, 471 &vfsops, \ 410 caddr_t arg, struct thread *td); 472 #fsname, \ 411 typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp, 473 -1, \ 412 struct thread *td); 474 0, \ 413 typedef int vfs_sync_t(struct mount *mp, int waitfor, struct ucred *cred, 475 flags \ 414 struct thread *td); 476 }; \ 415 typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, 477 static moduledata_t fsname ## _mod = { \ 416 struct vnode **vpp); 478 #fsname, \ 417 typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, struct vnode **vpp 479 vfs_modevent, \ ); 480 & fsname ## _vfsconf \ 418 typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, 481 }; \ 419 int *extflagsp, struct ucred **credanonp); 482 DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) 420 typedef int vfs_vptofh_t(struct vnode *vp, struct fid *fhp); 483 421 typedef int vfs_init_t(struct vfsconf *); 484 extern char *mountrootfsname; 422 typedef int vfs_uninit_t(struct vfsconf *); 485 423 typedef int vfs_extattrctl_t(struct mount *mp, int cmd, 486 /* 424 struct vnode *filename_vp, int attrnamespace, 487 * exported vnode operations 425 const char *attrname, struct thread *td); 488 */ 426 
typedef int vfs_nmount_t(struct mount *mp, struct nameidata *ndp, 489 int dounmount(struct mount *, int, struct thread *td); 427 struct thread *td); 490 int kernel_mount(struct iovec *iovp, unsigned int iovcnt, int flags); 428 491 int kernel_vmount(int flags, ...); 429 struct vfsops { 492 int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); 430 vfs_mount_t *vfs_mount; 493 int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); 431 vfs_start_t *vfs_start; 494 int vfs_mount(struct thread *td, const char *type, char *path, 11/12/03 00:01:40 sys/sys/mount.h 5 495 int flags, void *data); 558 int getfsstat(struct statfs *, long, int); 496 int vfs_setpublicfs /* set publicly exported fs */ 559 int getmntinfo(struct statfs **, int); 497 (struct mount *, struct netexport *, struct export_args *); 560 int mount(const char *, const char *, int, void *); 498 int vfs_lock(struct mount *); /* lock a vfs */ 561 int nmount(struct iovec *, u_int, int); 499 void vfs_msync(struct mount *, int); 562 int statfs(const char *, struct statfs *); 500 void vfs_unlock(struct mount *); /* unlock a vfs */ 563 int unmount(const char *, int); 501 int vfs_busy(struct mount *, int, struct mtx *, struct thread *td); 564 502 int vfs_export /* process mount export info */ 565 /* C library stuff */ 503 (struct mount *, struct export_args *); 566 void endvfsent(void); 504 struct netcred *vfs_export_lookup /* lookup host in fs export list * 567 struct ovfsconf *getvfsbytype(int); / 568 struct ovfsconf *getvfsent(void); 505 (struct mount *, struct sockaddr *); 569 int getvfsbyname(const char *, struct xvfsconf *); 506 int vfs_allocate_syncvnode(struct mount *); 570 void setvfsent(int); 507 void vfs_getnewfsid(struct mount *); 571 int vfsisloadable(const char *); 508 dev_t vfs_getrootfsid(struct mount *); 572 int vfsload(const char *); 509 struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ 573 __END_DECLS 510 int vfs_modevent(module_t, int, void *); 574 511 int vfs_mountedon(struct vnode *); /* is a vfs mounted on vp */ 575 #endif /* _KERNEL */ 512 void vfs_mountroot(void); /* mount our root filesystem * 576 / 577 #endif /* !_SYS_MOUNT_H_ */ 513 int vfs_rootmountalloc(char *, char *, struct mount **); 514 void vfs_mount_destroy(struct mount *, struct thread *); 515 void vfs_unbusy(struct mount *, struct thread *td); 516 void vfs_unmountall(void); 517 int vfs_register(struct vfsconf *); 518 int vfs_unregister(struct vfsconf *); 519 extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ 520 extern struct mtx mountlist_mtx; 521 extern struct nfs_public nfs_pub; 522 523 /* 524 * Declarations for these vfs default operations are located in 525 * kern/vfs_default.c, they should be used instead of making "dummy" 526 * functions or casting entries in the VFS op table to "enopnotsupp()". 527 */ 528 vfs_start_t vfs_stdstart; 529 vfs_root_t vfs_stdroot; 530 vfs_quotactl_t vfs_stdquotactl; 531 vfs_statfs_t vfs_stdstatfs; 532 vfs_sync_t vfs_stdsync; 533 vfs_sync_t vfs_stdnosync; 534 vfs_vget_t vfs_stdvget; 535 vfs_fhtovp_t vfs_stdfhtovp; 536 vfs_checkexp_t vfs_stdcheckexp; 537 vfs_vptofh_t vfs_stdvptofh; 538 vfs_init_t vfs_stdinit; 539 vfs_uninit_t vfs_stduninit; 540 vfs_extattrctl_t vfs_stdextattrctl; 541 542 /* XXX - these should be indirect functions!!! 
*/ 543 int softdep_fsync(struct vnode *); 544 int softdep_process_worklist(struct mount *); 545 546 #else /* !_KERNEL */ 547 548 #include 549 550 struct stat; 551 552 __BEGIN_DECLS 553 int fhopen(const struct fhandle *, int); 554 int fhstat(const struct fhandle *, struct stat *); 555 int fhstatfs(const struct fhandle *, struct statfs *); 556 int fstatfs(int, struct statfs *); 557 int getfh(const char *, fhandle_t *); 11/15/03 15:57:19 sys/sys/proc.h 1 1 /*- / 2 * Copyright (c) 1986, 1989, 1991, 1993 66 3 * The Regents of the University of California. All rights reserved. 67 /* 4 * (c) UNIX System Laboratories, Inc. 68 * One structure allocated per session. 5 * All or some portions of this file are derived from material licensed 69 * 6 * to the University of California by American Telephone and Telegraph 70 * List of locks 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 71 * (m) locked by s_mtx mtx 8 * the permission of UNIX System Laboratories, Inc. 72 * (e) locked by proctree_lock sx 9 * 73 * (c) const until freeing 10 * Redistribution and use in source and binary forms, with or without 74 */ 11 * modification, are permitted provided that the following conditions 75 struct session { 12 * are met: 76 int s_count; /* (m) Ref cnt; pgrps in session. */ 13 * 1. Redistributions of source code must retain the above copyright 77 struct proc *s_leader; /* (m + e) Session leader. */ 14 * notice, this list of conditions and the following disclaimer. 78 struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ 15 * 2. Redistributions in binary form must reproduce the above copyright 79 struct tty *s_ttyp; /* (m) Controlling tty. */ 16 * notice, this list of conditions and the following disclaimer in the 80 pid_t s_sid; /* (c) Session ID. */ 17 * documentation and/or other materials provided with the distribution. 81 /* (m) Setlogin() name: */ 18 * 3. All advertising materials mentioning features or use of this software 82 char s_login[roundup(MAXLOGNAME, sizeof(long))]; 19 * must display the following acknowledgement: 83 struct mtx s_mtx; /* Mutex to protect members. */ 20 * This product includes software developed by the University of 84 }; 21 * California, Berkeley and its contributors. 85 22 * 4. Neither the name of the University nor the names of its contributors 86 /* 23 * may be used to endorse or promote products derived from this software 87 * One structure allocated per process group. 24 * without specific prior written permission. 88 * 25 * 89 * List of locks 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 90 * (m) locked by pg_mtx mtx 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 91 * (e) locked by proctree_lock sx 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 92 * (c) const until freeing 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 93 */ 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 94 struct pgrp { 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 95 LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 96 LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 97 struct session *pg_session; /* (c) Pointer to session. 
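Besides the kernel interfaces, the non-_KERNEL half above exports the mount-related system calls and the getmntinfo()/getvfsbyname() C-library helpers. A small userland sketch that lists the mounted filesystems with getmntinfo(); error handling is kept minimal:

#include <sys/param.h>
#include <sys/ucred.h>
#include <sys/mount.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct statfs *mntbuf;
	int i, n;

	/* MNT_NOWAIT: do not ask each filesystem to refresh its statistics. */
	if ((n = getmntinfo(&mntbuf, MNT_NOWAIT)) == 0)
		err(1, "getmntinfo");
	for (i = 0; i < n; i++)
		printf("%-10s %s on %s\n", mntbuf[i].f_fstypename,
		    mntbuf[i].f_mntfromname, mntbuf[i].f_mntonname);
	return (0);
}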
*/ 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 98 struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 99 pid_t pg_id; /* (c) Pgrp id. */ 36 * SUCH DAMAGE. 100 int pg_jobc; /* (m) job cntl proc count */ 37 * 101 struct mtx pg_mtx; /* Mutex to protect members */ 38 * @(#)proc.h 8.15 (Berkeley) 5/19/95 102 }; 39 * $FreeBSD: src/sys/sys/proc.h,v 1.361 2003/11/15 23:57:19 imp Exp $ 103 40 */ 104 /* 41 105 * pargs, used to hold a copy of the command line, if it had a sane length. 42 #ifndef _SYS_PROC_H_ 106 */ 43 #define _SYS_PROC_H_ 107 struct pargs { 44 108 u_int ar_ref; /* Reference count. */ 45 #include /* For struct callout. */ 109 u_int ar_length; /* Length. */ 46 #include /* For struct klist. */ 110 u_char ar_args[1]; /* Arguments. */ 47 #ifndef _KERNEL 111 }; 48 #include 112 49 #endif 113 /*- 50 #include 114 * Description of a process. 51 #include 115 * 52 #include 116 * This structure contains the information needed to manage a thread of 53 #include 117 * control, known in UN*X as a process; it has references to substructures 54 #include /* XXX. */ 118 * containing descriptions of things that the process uses, but may share 55 #include 119 * with related processes. The process structure and the substructures 56 #include 120 * are always addressable except for those marked "(CPU)" below, 57 #include 121 * which might be addressable only on a processor on which the process 58 #ifndef _KERNEL 122 * is running. 59 #include /* For structs itimerval, timeval. */ 123 * 60 #else 124 * Below is a key of locks used to protect each member of struct proc. The 61 #include 125 * lock is indicated by a reference to a specific character in parens in the 62 #endif 126 * associated comment. 63 #include 127 * * - not yet protected 64 #include 128 * a - only touched by curproc or parent during fork/wait 65 #include /* Machine-dependent proc substruct. * 129 * b - created at fork, never changes 11/15/03 15:57:19 sys/sys/proc.h 2 130 * (exception aiods switch vmspaces, but they are also 195 * It represents the ability to take a slot in the scheduler queue. 131 * marked ’P_SYSTEM’ so hopefully it will be left alone) 196 * As long as this is scheduled, it could continue to run any threads that 132 * c - locked by proc mtx 197 * are assigned to the KSEGRP (see later) until either it runs out 133 * d - locked by allproc_lock lock 198 * of runnable threads of high enough priority, or CPU. 134 * e - locked by proctree_lock lock 199 * It runs on one CPU and is assigned a quantum of time. When a thread is 135 * f - session mtx 200 * blocked, The KSE continues to run and will search for another thread 136 * g - process group mtx 201 * in a runnable state amongst those it has. It May decide to return to user 137 * h - callout_lock mtx 202 * mode with a new ’empty’ thread if there are no runnable threads. 138 * i - by curproc or the master session mtx 203 * Threads are temporarily associated with a KSE for reasons. 139 * j - locked by sched_lock mtx 204 */ 140 * k - only accessed by curthread 205 struct kse; 141 * l - the attaching proc or attaching proc parent 206 142 * m - Giant 207 /* 143 * n - not locked, lazy 208 * The KSEGRP is allocated resources across a number of CPUs. 144 * o - ktrace lock 209 * (Including a number of CPUxQUANTA. It parcels these QUANTA up among 145 * p - select lock (sellock) 210 * its KSEs, each of which should be running in a different CPU. 
146 * q - td_contested lock 211 * BASE priority and total available quanta are properties of a KSEGRP. 147 * r - p_peers lock 212 * Multiple KSEGRPs in a single process compete against each other 148 * x - created at fork, only changes during single threading in exec 213 * for total quanta in the same way that a forked child competes against 149 * z - zombie threads/kse/ksegroup lock 214 * it’s parent process. 150 * 215 */ 151 * If the locking key specifies two identifiers (for example, p_pptr) then 216 struct ksegrp; 152 * either lock is sufficient for read access, but both locks must be held 217 153 * for write access. 218 /* 154 */ 219 * A process is the owner of all system resources allocated to a task 155 struct ithd; 220 * except CPU quanta. 156 struct ke_sched; 221 * All KSEGs under one process see, and have the same access to, these 157 struct kg_sched; 222 * resources (e.g. files, memory, sockets, permissions kqueues). 158 struct nlminfo; 223 * A process may compete for CPU cycles on the same basis as a 159 struct p_sched; 224 * forked process cluster by spawning several KSEGRPs. 160 struct td_sched; 225 */ 161 struct trapframe; 226 struct proc; 162 struct turnstile; 227 163 228 /*************** 164 /* 229 * In pictures: 165 * Here we define the four structures used for process information. 230 With a single run queue used by all processors: 166 * 231 167 * The first is the thread. It might be though of as a "Kernel 232 RUNQ: --->KSE---KSE--... SLEEPQ:[]---THREAD---THREAD---THREAD 168 * Schedulable Entity Context". 233 | / []---THREAD 169 * This structure contains all the information as to where a thread of 234 KSEG---THREAD--THREAD--THREAD [] 170 * execution is now, or was when it was suspended, why it was suspended, 235 []---THREAD---THREAD 171 * and anything else that will be needed to restart it when it is 236 172 * rescheduled. Always associated with a KSE when running, but can be 237 (processors run THREADs from the KSEG until they are exhausted or 173 * reassigned to an equivalent KSE when being restarted for 238 the KSEG exhausts its quantum) 174 * load balancing. Each of these is associated with a kernel stack 239 175 * and a pcb. 240 With PER-CPU run queues: 176 * 241 KSEs on the separate run queues directly 177 * It is important to remember that a particular thread structure only 242 They would be given priorities calculated from the KSEG. 178 * exists as long as the system call or kernel entrance (e.g. by pagefault) 243 179 * which it is currently executing. It should therefore NEVER be referenced 244 * 180 * by pointers in long lived structures that live longer than a single 245 *****************/ 181 * request. If several threads complete their work at the same time, 246 182 * they will all rewind their stacks to the user boundary, report their 247 /* 183 * completion state, and all but one will be freed. That last one will 248 * Kernel runnable context (thread). 184 * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel 249 * This is what is put to sleep and reactivated. 185 * entrance. (basically to save freeing and then re-allocating it) The KSE 250 * The first KSE available in the correct group will run this thread. 186 * keeps a cached thread available to allow it to quickly 251 * If several are available, use the one on the same CPU as last time. 187 * get one when it needs a new one. There is also a system 252 * When waiting to be run, threads are hung off the KSEGRP in priority order. 188 * cache of free threads. 
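The two-identifier rule described above (either lock suffices for reading, both are needed for writing) can be made concrete with p_pptr, whose key is "(c + e)". A minimal sketch, using a hypothetical helper; the write side is what proc_reparent(), declared further down in this header, performs while holding both proctree_lock and the proc mutex:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

/* Read p_pptr: the proc mutex (c) alone is sufficient. */
static pid_t
parent_pid(struct proc *p)
{
	pid_t ppid;

	PROC_LOCK(p);
	ppid = (p->p_pptr != NULL) ? p->p_pptr->p_pid : 0;
	PROC_UNLOCK(p);
	return (ppid);
}

Holding the shared proctree_lock instead of the proc mutex would be equally valid for the read; only a writer must take both.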
Threads have priority and partake in priority 253 * with N runnable and queued KSEs in the KSEGRP, the first N threads 189 * inheritance schemes. 254 * are linked to them. Other threads are not yet assigned. 190 */ 255 */ 191 struct thread; 256 struct thread { 192 257 struct proc *td_proc; /* (*) Associated process. */ 193 /* 258 struct ksegrp *td_ksegrp; /* (*) Associated KSEG. */ 194 * The second structure is the Kernel Schedulable Entity. (KSE) 259 TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ 11/15/03 15:57:19 sys/sys/proc.h 3 260 TAILQ_ENTRY(thread) td_kglist; /* (*) All threads in this ksegrp. */ 315 * fields that must be manually set in fork1() or thread_sched_upcall() 261 316 * or already have been set in the allocator, contstructor, etc.. 262 /* The two queues below should someday be merged. */ 317 */ 263 TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. */ 318 struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ 264 TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. */ 319 enum { 265 TAILQ_ENTRY(thread) td_runq; /* (j/z) Run queue(s). XXXKSE */ 320 TDS_INACTIVE = 0x0, 266 321 TDS_INHIBITED, 267 TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ 322 TDS_CAN_RUN, 268 struct turnstile *td_turnstile; /* (k) Associated turnstile. */ 323 TDS_RUNQ, 269 324 TDS_RUNNING 270 /* Cleared during fork1() or thread_sched_upcall(). */ 325 } td_state; 271 #define td_startzero td_flags 326 register_t td_retval[2]; /* (k) Syscall aux returns. */ 272 int td_flags; /* (j) TDF_* flags. */ 327 struct callout td_slpcallout; /* (h) Callout for sleep. */ 273 int td_inhibitors; /* (j) Why can not run. */ 328 struct trapframe *td_frame; /* (k) */ 274 int td_pflags; /* (k) Private thread (TDP_*) flags. * 329 struct vm_object *td_kstack_obj;/* (a) Kstack object. */ / 330 vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ 275 struct kse *td_last_kse; /* (j) Previous value of td_kse. */ 331 int td_kstack_pages; /* (a) Size of the kstack. */ 276 struct kse *td_kse; /* (j) Current KSE if running. */ 332 struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. */ 277 int td_dupfd; /* (k) Ret value from fdopen. XXX */ 333 vm_offset_t td_altkstack; /* (a) Kernel VA of alternate kstack. 278 void *td_wchan; /* (j) Sleep address. */ */ 279 const char *td_wmesg; /* (j) Reason for sleep. */ 334 int td_altkstack_pages; /* (a) Size of the alternate kstac 280 u_char td_lastcpu; /* (j) Last cpu we were on. */ k */ 281 u_char td_oncpu; /* (j) Which cpu we are on. */ 335 u_int td_critnest; /* (k) Critical section nest level. */ 282 short td_locks; /* (k) DEBUG: lockmgr count of locks. 336 struct mdthread td_md; /* (k) Any machine-dependent fields. * */ / 283 struct turnstile *td_blocked; /* (j) Lock process is blocked on. */ 337 struct td_sched *td_sched; /* (*) Scheduler-specific data. */ 284 struct ithd *td_ithd; /* (b) For interrupt threads only. */ 338 }; 285 const char *td_lockname; /* (j) Name of lock blocked on. */ 339 /* flags kept in td_flags */ 286 LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ 340 #define TDF_INPANIC 0x000002 /* Caused a panic, let it drive crashdump. */ 287 struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ 341 #define TDF_CAN_UNBIND 0x000004 /* Only temporarily bound. */ 288 int td_intr_nesting_level; /* (k) Interrupt recursion. */ 342 #define TDF_SINTR 0x000008 /* Sleep is interruptible. */ 289 int td_pinned; /* (k) Temporary cpu pin count. */ 343 #define TDF_TIMEOUT 0x000010 /* Timing out during sleep. 
*/ 290 struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. * 344 #define TDF_IDLETD 0x000020 /* This is one of the per-CPU idle threads. * / / 291 struct ucred *td_ucred; /* (k) Reference to credentials. */ 345 #define TDF_SELECT 0x000040 /* Selecting; wakeup/waiting danger. */ 292 struct thread *td_standin; /* (*) Use this for an upcall. */ 346 #define TDF_CVWAITQ 0x000080 /* Thread is on a cv_waitq (not slpq). */ 293 u_int td_prticks; /* (*) Profclock hits in sys for user 347 #define TDF_TSNOBLOCK 0x000100 /* Don’t block on a turnstile due to race. */ */ 348 #define TDF_ONSLEEPQ 0x000200 /* On the sleep queue. */ 294 struct kse_upcall *td_upcall; /* (*) Upcall structure. */ 349 #define TDF_ASTPENDING 0x000800 /* Thread has some asynchronous events. */ 295 u_int64_t td_sticks; /* (j) Statclock hits in system mode. 350 #define TDF_TIMOFAIL 0x001000 /* Timeout from sleep after we were awake. */ */ 351 #define TDF_INTERRUPT 0x002000 /* Thread is marked as interrupted. */ 296 u_int td_uuticks; /* (*) Statclock in user, for UTS. */ 352 #define TDF_USTATCLOCK 0x004000 /* Stat clock hits in userland. */ 297 u_int td_usticks; /* (*) Statclock in kernel, for UTS. * 353 #define TDF_OWEUPC 0x008000 /* Owe thread an addupc() call at next AST. * / / 298 int td_intrval; /* (*) Return value of TDF_INTERRUPT. 354 #define TDF_NEEDRESCHED 0x010000 /* Thread needs to yield. */ */ 355 #define TDF_NEEDSIGCHK 0x020000 /* Thread may need signal delivery. */ 299 sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. * 356 #define TDF_SA 0x040000 /* A scheduler activation based thread. */ / 357 #define TDF_UMTXWAKEUP 0x080000 /* Libthr thread must not sleep on a umtx. */ 300 sigset_t td_sigmask; /* (c) Current signal mask. */ 358 #define TDF_DEADLKTREAT 0x800000 /* Lock aquisition - deadlock treatment. */ 301 sigset_t td_siglist; /* (c) Sigs arrived, not delivered. */ 359 302 sigset_t *td_waitset; /* (c) Wait set for sigwait. */ 360 /* "private" flags kept in td_pflags */ 303 TAILQ_ENTRY(thread) td_umtx; /* (c?) Link for when we’re blocked. * 361 #define TDP_OLDMASK 0x0001 /* Need to restore mask after suspend. */ / 362 #define TDP_INKTR 0x0002 /* Thread is currently in KTR code. */ 304 volatile u_int td_generation; /* (k) Enable detection of preemption 363 #define TDP_INKTRACE 0x0004 /* Thread is currently in KTRACE code. */ */ 364 #define TDP_UPCALLING 0x0008 /* This thread is doing an upcall. */ 305 365 #define TDP_COWINPROGRESS 0x0010 /* Snapshot copy-on-write in progress. */ 306 #define td_endzero td_base_pri 366 307 367 #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ 308 /* Copied during fork1() or thread_sched_upcall(). */ 368 #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ 309 #define td_startcopy td_endzero 369 #define TDI_SWAPPED 0x0004 /* Stack not in mem.. bad juju if run. */ 310 u_char td_base_pri; /* (j) Thread base kernel priority. */ 370 #define TDI_LOCK 0x0008 /* Stopped on a lock. */ 311 u_char td_priority; /* (j) Thread active priority. */ 371 #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ 312 #define td_endcopy td_pcb 372 313 373 #define TD_CAN_UNBIND(td) \ 314 /* 374 (((td)->td_flags & TDF_CAN_UNBIND) == TDF_CAN_UNBIND && \ 11/15/03 15:57:19 sys/sys/proc.h 4 375 ((td)->td_upcall != NULL)) 439 u_char ke_oncpu; /* (j) Which cpu we are on. */ 376 440 char ke_rqindex; /* (j) Run queue index. 
*/ 377 #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) 441 enum { 378 #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) 442 KES_UNUSED = 0x0, 379 #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) 443 KES_IDLE, 380 #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) 444 KES_ONRUNQ, 381 #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) 445 KES_UNQUEUED, /* in transit */ 382 #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) 446 KES_THREAD /* slaved to thread state */ 383 #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) 447 } ke_state; /* (j) KSE status. */ 384 #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) 448 #define ke_endzero ke_dummy 385 #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) 449 u_char ke_dummy; 386 #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) 450 struct ke_sched *ke_sched; /* (*) Scheduler-specific data. */ 387 451 }; 388 #define TD_SET_INHIB(td, inhib) do { \ 452 389 (td)->td_state = TDS_INHIBITED; \ 453 /* flags kept in ke_flags */ 390 (td)->td_inhibitors |= (inhib); \ 454 #define KEF_SCHED0 0x00001 /* For scheduler-specific use. */ 391 } while (0) 455 #define KEF_SCHED1 0x00002 /* For scheduler-specific use. */ 392 456 #define KEF_SCHED2 0X00004 /* For scheduler-specific use. */ 393 #define TD_CLR_INHIB(td, inhib) do { \ 457 #define KEF_SCHED3 0x00008 /* For scheduler-specific use. */ 394 if (((td)->td_inhibitors & (inhib)) && \ 458 #define KEF_DIDRUN 0x02000 /* KSE actually ran. */ 395 (((td)->td_inhibitors &= ˜(inhib)) == 0)) \ 459 #define KEF_EXIT 0x04000 /* KSE is being killed. */ 396 (td)->td_state = TDS_CAN_RUN; \ 460 397 } while (0) 461 /* 398 462 * The upcall management structure. 399 #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) 463 * The upcall is used when returning to userland. If a thread does not have 400 #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) 464 * an upcall on return to userland the thread exports its context and exits. 401 #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) 465 */ 402 #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) 466 struct kse_upcall { 403 #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) 467 TAILQ_ENTRY(kse_upcall) ku_link; /* List of upcalls in KSEG. */ 404 #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) 468 struct ksegrp *ku_ksegrp; /* Associated KSEG. */ 405 469 struct thread *ku_owner; /* owning thread */ 406 #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) 470 int ku_flags; /* KUF_* flags. */ 407 #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) 471 struct kse_mailbox *ku_mailbox; /* userland mailbox address. * 408 #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) / 409 #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) 472 stack_t ku_stack; /* userland upcall stack. */ 410 #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) 473 void *ku_func; /* userland upcall function. * 411 / 412 #define TD_SET_RUNNING(td) do {(td)->td_state = TDS_RUNNING; } while (0) 474 unsigned int ku_mflags; /* cached upcall mailbox flags 413 #define TD_SET_RUNQ(td) do {(td)->td_state = TDS_RUNQ; } while (0) */ 414 #define TD_SET_CAN_RUN(td) do {(td)->td_state = TDS_CAN_RUN; } while (0) 475 }; 415 #define TD_SET_ON_SLEEPQ(td) do {(td)->td_flags |= TDF_ONSLEEPQ; } while (0 476 ) 477 #define KUF_DOUPCALL 0x00001 /* Do upcall now, don’t wait. */ 416 #define TD_CLR_ON_SLEEPQ(td) do { \ 478 #define KUF_EXITING 0x00002 /* Upcall structure is exiting. 
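A sketch of how the TD_SET_*/TD_CLR_* helpers above compose. The real consumers are msleep()/wakeup() and the scheduler; the sched_lock requirement follows from the (j) key on td_state and td_inhibitors. Treat this as an illustration of the macros, not of the actual sleep path, and note that real code wakes a thread through setrunnable():

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static void
park_thread(struct thread *td)
{
	mtx_lock_spin(&sched_lock);
	TD_SET_SLEEPING(td);		/* TDS_INHIBITED with TDI_SLEEPING set */
	mtx_unlock_spin(&sched_lock);
}

static void
unpark_thread(struct thread *td)
{
	mtx_lock_spin(&sched_lock);
	TD_CLR_SLEEPING(td);		/* last inhibitor gone => TDS_CAN_RUN */
	if (TD_CAN_RUN(td))
		setrunqueue(td);	/* declared later in this header */
	mtx_unlock_spin(&sched_lock);
}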
*/ 417 (td)->td_flags &= ˜TDF_ONSLEEPQ; \ 479 418 (td)->td_wchan = NULL; \ 480 /* 419 } while (0) 481 * Kernel-scheduled entity group (KSEG). The scheduler considers each KSEG to 420 482 * be an indivisible unit from a time-sharing perspective, though each KSEG ma 421 /* y 422 * The schedulable entity that can be given a context to run. 483 * contain multiple KSEs. 423 * A process may have several of these. Probably one per processor 484 */ 424 * but posibly a few more. In this universe they are grouped 485 struct ksegrp { 425 * with a KSEG that contains the priority and niceness 486 struct proc *kg_proc; /* (*) Process that contains this KSEG 426 * for the group. . */ 427 */ 487 TAILQ_ENTRY(ksegrp) kg_ksegrp; /* (*) Queue of KSEGs in kg_proc. */ 428 struct kse { 488 TAILQ_HEAD(, kse) kg_kseq; /* (ke_kglist) All KSEs. */ 429 struct proc *ke_proc; /* (*) Associated process. */ 489 TAILQ_HEAD(, kse) kg_iq; /* (ke_kgrlist) All idle KSEs. */ 430 struct ksegrp *ke_ksegrp; /* (*) Associated KSEG. */ 490 TAILQ_HEAD(, thread) kg_threads;/* (td_kglist) All threads. */ 431 TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of KSEs in ke_ksegrp. */ 491 TAILQ_HEAD(, thread) kg_runq; /* (td_runq) waiting RUNNABLE threads 432 TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of KSEs in this state. */ */ 433 TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ 492 TAILQ_HEAD(, thread) kg_slpq; /* (td_runq) NONRUNNABLE threads. */ 434 493 TAILQ_HEAD(, kse_upcall) kg_upcalls; /* All upcalls in the group. * 435 #define ke_startzero ke_flags / 436 int ke_flags; /* (j) KEF_* flags. */ 494 #define kg_startzero kg_estcpu 437 struct thread *ke_thread; /* (*) Active associated thread. */ 495 u_int kg_estcpu; /* (j) Sum of the same field in KSEs. 438 fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ */ 11/15/03 15:57:19 sys/sys/proc.h 5 496 u_int kg_slptime; /* (j) How long completely blocked. */ */ 497 struct thread *kg_last_assigned; /* (j) Last thread assigned to a KS 554 LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ E. */ 555 LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ 498 int kg_runnable; /* (j) Num runnable threads on queue. 556 struct mtx p_mtx; /* (n) Lock for this struct. */ */ 557 499 int kg_runq_kses; /* (j) Num KSEs on runq. */ 558 /* The following fields are all zeroed upon creation in fork. */ 500 int kg_idle_kses; /* (j) Num KSEs on iq. */ 559 #define p_startzero p_oppid 501 int kg_numupcalls; /* (j) Num upcalls. */ 560 pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ 502 int kg_upsleeps; /* (c) Num threads in kse_release(). * 561 struct vmspace *p_vmspace; /* (b) Address space. */ / 562 u_int p_swtime; /* (j) Time swapped in or out. */ 503 struct kse_thr_mailbox *kg_completed; /* (c) Completed thread mboxes. 563 struct itimerval p_realtimer; /* (c) Alarm timer. */ */ 564 struct bintime p_runtime; /* (j) Real time. */ 504 int kg_nextupcall; /* (*) Next upcall time. */ 565 u_int64_t p_uu; /* (j) Previous user time in usec. */ 505 int kg_upquantum; /* (*) Quantum to schedule an upcall. 566 u_int64_t p_su; /* (j) Previous system time in usec. * */ / 506 #define kg_endzero kg_pri_class 567 u_int64_t p_iu; /* (j) Previous intr time in usec. */ 507 568 u_int64_t p_uticks; /* (j) Statclock hits in user mode. */ 508 #define kg_startcopy kg_endzero 569 u_int64_t p_sticks; /* (j) Statclock hits in system mode. 509 u_char kg_pri_class; /* (j) Scheduling class. */ */ 510 u_char kg_user_pri; /* (j) User pri from estcpu and nice. 570 u_int64_t p_iticks; /* (j) Statclock hits in intr. 
*/ */ 571 int p_profthreads; /* (c) Num threads in addupc_task. */ 511 char kg_nice; /* (c + j) Process "nice" value. */ 572 int p_maxthrwaits; /* (c) Max threads num waiters */ 512 #define kg_endcopy kg_numthreads 573 int p_traceflag; /* (o) Kernel trace points. */ 513 int kg_numthreads; /* (j) Num threads in total. */ 574 struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ 514 int kg_kses; /* (j) Num KSEs in group. */ 575 struct ucred *p_tracecred; /* (o) Credentials to trace with. */ 515 struct kg_sched *kg_sched; /* (*) Scheduler-specific data. */ 576 struct vnode *p_textvp; /* (b) Vnode of executable. */ 516 }; 577 sigset_t p_siglist; /* (c) Sigs not delivered to a td. */ 517 578 char p_lock; /* (c) Proclock (prevent swap) count. 518 /* */ 519 * The old fashionned process. May have multiple threads, KSEGRPs 579 struct klist p_klist; /* (c) Knotes attached to this proc. * 520 * and KSEs. Starts off with a single embedded KSEGRP, KSE and THREAD. / 521 */ 580 struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ 522 struct proc { 581 int p_sigparent; /* (c) Signal to parent on exit. */ 523 LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ 582 int p_sig; /* (n) For core dump/debugger XXX. */ 524 TAILQ_HEAD(, ksegrp) p_ksegrps; /* (kg_ksegrp) All KSEGs. */ 583 u_long p_code; /* (n) For core dump/debugger XXX. */ 525 TAILQ_HEAD(, thread) p_threads; /* (td_plist) Threads. (shortcut) */ 584 u_int p_stops; /* (c) Stop event bitmask. */ 526 TAILQ_HEAD(, thread) p_suspended; /* (td_runq) Suspended threads. */ 585 u_int p_stype; /* (c) Stop event type. */ 527 struct ucred *p_ucred; /* (c) Process owner’s identity. */ 586 char p_step; /* (c) Process is stopped. */ 528 struct filedesc *p_fd; /* (b) Ptr to open files structure. */ 587 u_char p_pfsflags; /* (c) Procfs flags. */ 529 struct filedesc_to_leader *p_fdtol; /* (b) Ptr to tracking node */ 588 struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ 530 /* Accumulated stats for all KSEs? */ 589 void *p_aioinfo; /* (?) ASYNC I/O info. */ 531 struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ 590 struct thread *p_singlethread;/* (c + j) If single threading this is 532 struct plimit *p_limit; /* (c*) Process limits. */ it */ 533 struct vm_object *p_upages_obj; /* (a) Upages object. */ 591 int p_suspcount; /* (c) # threads in suspended mode */ 534 struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ 592 /* End area that is zeroed on creation. */ 535 593 #define p_endzero p_sigstk 536 /*struct ksegrp p_ksegrp; 594 537 struct kse p_kse; */ 595 /* The following fields are all copied upon creation in fork. */ 538 596 #define p_startcopy p_endzero 539 /* 597 stack_t p_sigstk; /* (c) Stack ptr and on-stack flag. */ 540 * The following don’t make too much sense.. 598 u_int p_magic; /* (b) Magic number. */ 541 * See the td_ or ke_ versions of the same flags 599 char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ 542 */ 600 struct pgrp *p_pgrp; /* (c + e) Pointer to process group. * 543 int p_flag; /* (c) P_* flags. */ / 544 int p_sflag; /* (j) PS_* flags. */ 601 struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ 545 enum { 602 struct pargs *p_args; /* (c) Process arguments. */ 546 PRS_NEW = 0, /* In creation */ 603 rlim_t p_cpulimit; /* (j) Current CPU limit in seconds. * 547 PRS_NORMAL, /* KSEs can be run. */ / 548 PRS_ZOMBIE 604 /* End area that is copied on creation. */ 549 } p_state; /* (j/c) S* process status. */ 605 #define p_endcopy p_xstat 550 pid_t p_pid; /* (b) Process identifier. 
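The p_startzero/p_endzero and p_startcopy/p_endcopy markers above exist so that fork can clear and inherit whole regions of struct proc by address arithmetic instead of field by field. A sketch of the idiom (kern/kern_fork.c:fork1() does essentially this; the function name here is illustrative):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>

static void
init_child_from_parent(struct proc *p2, struct proc *p1)
{
	/* Zero everything between the zero markers in the new process... */
	bzero(&p2->p_startzero,
	    (unsigned)((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
	/* ...and inherit the p_startcopy..p_endcopy region from the parent. */
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    (unsigned)((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
}

This is why the comments insist that new members land on the correct side of the markers: the byte ranges, not the individual fields, define what fork zeroes and copies.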
*/ 606 551 LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ 607 u_short p_xstat; /* (c) Exit status; also stop sig. */ 552 LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. 608 int p_numthreads; /* (j) Number of threads. */ */ 609 int p_numksegrps; /* (?) number of ksegrps */ 553 struct proc *p_pptr; /* (c + e) Pointer to parent process. 610 struct mdproc p_md; /* Any machine-dependent fields. */ 11/15/03 15:57:19 sys/sys/proc.h 6 611 struct callout p_itcallout; /* (h + c) Interval timer callout. */ 672 #define SIDL 1 /* Process being created by fork. */ 612 struct user *p_uarea; /* (k) Kernel VA of u-area (CPU). */ 673 #define SRUN 2 /* Currently runnable. */ 613 u_short p_acflag; /* (c) Accounting flags. */ 674 #define SSLEEP 3 /* Sleeping on an address. */ 614 struct rusage *p_ru; /* (a) Exit information. XXX */ 675 #define SSTOP 4 /* Process or suspension. */ 615 struct proc *p_peers; /* (r) */ 676 #define SZOMB 5 /* Awaiting collection by parent. */ 616 struct proc *p_leader; /* (b) */ 677 #define SWAIT 6 /* Waiting for interrupt. */ 617 void *p_emuldata; /* (c) Emulator state data. */ 678 #define SLOCK 7 /* Blocked on a lock. */ 618 struct label *p_label; /* (*) Proc (not subject) MAC label. * 679 / 680 #define P_MAGIC 0xbeefface 619 struct p_sched *p_sched; /* (*) Scheduler-specific data. */ 681 620 }; 682 #ifdef _KERNEL 621 683 622 #define p_rlimit p_limit->pl_rlimit 684 #ifdef MALLOC_DECLARE 623 #define p_session p_pgrp->pg_session 685 MALLOC_DECLARE(M_PARGS); 624 #define p_pgid p_pgrp->pg_id 686 MALLOC_DECLARE(M_PGRP); 625 687 MALLOC_DECLARE(M_SESSION); 626 #define NOCPU 0xff /* For when we aren’t on a CPU. (SMP) */ 688 MALLOC_DECLARE(M_SUBPROC); 627 689 MALLOC_DECLARE(M_ZOMBIE); 628 /* Status values (p_stat). */ 690 #endif 629 691 630 /* These flags are kept in p_flag. */ 692 #define FOREACH_PROC_IN_SYSTEM(p) \ 631 #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ 693 LIST_FOREACH((p), &allproc, p_list) 632 #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ 694 #define FOREACH_KSEGRP_IN_PROC(p, kg) \ 633 #define P_KTHREAD 0x00004 /* Kernel thread. (*)*/ 695 TAILQ_FOREACH((kg), &(p)->p_ksegrps, kg_ksegrp) 634 #define P_NOLOAD 0x00008 /* Ignore during load avg calculations. */ 696 #define FOREACH_THREAD_IN_GROUP(kg, td) \ 635 #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. * 697 TAILQ_FOREACH((td), &(kg)->kg_threads, td_kglist) / 698 #define FOREACH_KSE_IN_GROUP(kg, ke) \ 636 #define P_PROFIL 0x00020 /* Has started profiling. */ 699 TAILQ_FOREACH((ke), &(kg)->kg_kseq, ke_kglist) 637 #define P_STOPPROF 0x00040 /* Has thread in requesting to stop prof */ 700 #define FOREACH_UPCALL_IN_GROUP(kg, ku) \ 638 #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ 701 TAILQ_FOREACH((ku), &(kg)->kg_upcalls, ku_link) 639 #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ 702 #define FOREACH_THREAD_IN_PROC(p, td) \ 640 #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. * 703 TAILQ_FOREACH((td), &(p)->p_threads, td_plist) / 704 641 #define P_TRACED 0x00800 /* Debugged process being traced. */ 705 /* XXXKSE the lines below should probably only be used in 1:1 code */ 642 #define P_WAITED 0x01000 /* Someone is waiting for us. */ 706 #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) 643 #define P_WEXIT 0x02000 /* Working on exiting. */ 707 #define FIRST_KSEGRP_IN_PROC(p) TAILQ_FIRST(&(p)->p_ksegrps) 644 #define P_EXEC 0x04000 /* Process called exec. 
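The FOREACH_* iterators above are the supported way to walk a process's KSEGRPs, KSEs and threads. A sketch that counts a process's running or runnable threads; holding sched_lock follows from the (j) key on td_state, while keeping the proc locked so the thread list cannot change underneath the walk is an assumption:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static int
count_runnable_threads(struct proc *p)
{
	struct thread *td;
	int n;

	PROC_LOCK_ASSERT(p, MA_OWNED);	/* assumed: keeps p_threads stable */
	n = 0;
	mtx_lock_spin(&sched_lock);
	FOREACH_THREAD_IN_PROC(p, td) {
		if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td))
			n++;
	}
	mtx_unlock_spin(&sched_lock);
	return (n);
}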
*/ 708 #define FIRST_KSE_IN_KSEGRP(kg) TAILQ_FIRST(&(kg)->kg_kseq) 645 #define P_SA 0x08000 /* Using scheduler activations. */ 709 #define FIRST_KSE_IN_PROC(p) FIRST_KSE_IN_KSEGRP(FIRST_KSEGRP_IN_PROC(p)) 646 #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ 710 647 #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ 711 /* 648 #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ 712 * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, 649 #define P_STOPPED_SINGLE 0x80000 /* Only one thread can continue */ 713 * as it is used to represent "no process group". 650 /* (not to user) */ 714 */ 651 #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ 715 #define PID_MAX 99999 652 #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ 716 #define NO_PID 100000 653 717 654 #define P_JAILED 0x1000000 /* Process is in jail. */ 718 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) 655 #define P_ALTSTACK 0x2000000 /* Have alternate signal stack. */ 719 #define SESSHOLD(s) ((s)->s_count++) 656 #define P_INEXEC 0x4000000 /* Process is in execve(). */ 720 #define SESSRELE(s) { \ 657 721 if (--(s)->s_count == 0) \ 658 #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRAC 722 FREE(s, M_SESSION); \ E) 723 } 659 #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) 724 660 725 #define STOPEVENT(p, e, v) do { \ 661 /* These flags are kept in p_sflag and are protected with sched_lock. */ 726 PROC_LOCK(p); \ 662 #define PS_INMEM 0x00001 /* Loaded into memory. */ 727 _STOPEVENT((p), (e), (v)); \ 663 #define PS_XCPU 0x00002 /* Exceeded CPU limit. */ 728 PROC_UNLOCK(p); \ 664 #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ 729 } while (0) 665 #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ 730 #define _STOPEVENT(p, e, v) do { \ 666 #define PS_SWAPINREQ 0x00100 /* Swapin request due to wakeup. */ 731 PROC_LOCK_ASSERT(p, MA_OWNED); \ 667 #define PS_SWAPPINGOUT 0x00200 /* Process is being swapped out. */ 732 if ((p)->p_stops & (e)) \ 668 #define PS_SWAPPINGIN 0x04000 /* Process is being swapped in. */ 733 stopevent((p), (e), (v)); \ 669 #define PS_MACPEND 0x08000 /* Ast()-based MAC event pending. */ 734 } while (0) 670 735 671 /* used only in legacy conversion code */ 736 /* Lock and unlock a process. */ 11/15/03 15:57:19 sys/sys/proc.h 7 737 #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) 802 extern struct sx allproc_lock; 738 #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) 803 extern struct sx proctree_lock; 739 #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) 804 extern struct mtx pargs_ref_lock; 740 #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) 805 extern struct mtx ppeers_lock; 741 #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) 806 extern struct proc proc0; /* Process slot for swapper. */ 742 807 extern struct thread thread0; /* Primary thread in proc0. */ 743 /* Lock and unlock a process group. */ 808 extern struct ksegrp ksegrp0; /* Primary ksegrp in proc0. */ 744 #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) 809 extern struct kse kse0; /* Primary kse in proc0. */ 745 #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) 810 extern struct vmspace vmspace0; /* VM space for proc0. */ 746 #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) 811 extern int hogticks; /* Limit on kernel cpu hogs. */ 747 #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) 812 extern int nprocs, maxproc; /* Current and max number of procs. 
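STOPEVENT() above is the procfs debugging hook: if the corresponding bit is armed in p_stops, the thread parks in stopevent() until it is released by the tracing process. A sketch of how an exec-like path might raise the event; S_EXEC is assumed to be the event constant from <sys/pioctl.h>:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pioctl.h>

static void
raise_exec_stopevent(struct proc *p)
{
	/* STOPEVENT() takes and drops the proc lock around _STOPEVENT(). */
	STOPEVENT(p, S_EXEC, 0);
}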
*/ 748 813 extern int maxprocperuid; /* Max procs per uid. */ 749 #define PGRP_LOCK_PGSIGNAL(pg) do { \ 814 extern u_long ps_arg_cache_limit; 750 if ((pg) != NULL) \ 815 extern int ps_argsopen; 751 PGRP_LOCK(pg); \ 816 extern int sched_quantum; /* Scheduling quantum in ticks. */ 752 } while (0) 817 753 #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ 818 LIST_HEAD(proclist, proc); 754 if ((pg) != NULL) \ 819 TAILQ_HEAD(procqueue, proc); 755 PGRP_UNLOCK(pg); \ 820 TAILQ_HEAD(threadqueue, thread); 756 } while (0) 821 extern struct proclist allproc; /* List of all processes. */ 757 822 extern struct proclist zombproc; /* List of zombie processes. */ 758 /* Lock and unlock a session. */ 823 extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ 759 #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) 824 extern struct proc *updateproc; /* Process slot for syncer (sic). */ 760 #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) 825 761 #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) 826 extern struct uma_zone *proc_zone; 762 #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) 827 763 828 extern int lastpid; 764 /* Hold process U-area in memory, normally for ptrace/procfs work. */ 829 765 #define PHOLD(p) do { \ 830 struct proc *pfind(pid_t); /* Find process by id. */ 766 PROC_LOCK(p); \ 831 struct pgrp *pgfind(pid_t); /* Find process group by id. */ 767 _PHOLD(p); \ 832 struct proc *zpfind(pid_t); /* Find zombie process by id. */ 768 PROC_UNLOCK(p); \ 833 769 } while (0) 834 void adjustrunqueue(struct thread *, int newpri); 770 #define _PHOLD(p) do { \ 835 void ast(struct trapframe *framep); 771 PROC_LOCK_ASSERT((p), MA_OWNED); \ 836 struct thread *choosethread(void); 772 (p)->p_lock++; \ 837 int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); 773 if (((p)->p_sflag & PS_INMEM) == 0) \ 838 int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct sessio 774 faultin((p)); \ n *sess); 775 } while (0) 839 int enterthispgrp(struct proc *p, struct pgrp *pgrp); 776 840 void faultin(struct proc *p); 777 #define PRELE(p) do { \ 841 void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); 778 PROC_LOCK((p)); \ 842 int fork1(struct thread *, int, int, struct proc **); 779 _PRELE((p)); \ 843 void fork_exit(void (*)(void *, struct trapframe *), void *, 780 PROC_UNLOCK((p)); \ 844 struct trapframe *); 781 } while (0) 845 void fork_return(struct thread *, struct trapframe *); 782 #define _PRELE(p) do { \ 846 int inferior(struct proc *p); 783 PROC_LOCK_ASSERT((p), MA_OWNED); \ 847 int leavepgrp(struct proc *p); 784 (--(p)->p_lock); \ 848 void mi_switch(void); 785 } while (0) 849 int p_candebug(struct thread *td, struct proc *p); 786 850 int p_cansee(struct thread *td, struct proc *p); 787 /* Check whether a thread is safe to be swapped out. */ 851 int p_cansched(struct thread *td, struct proc *p); 788 #define thread_safetoswapout(td) (TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td)) 852 int p_cansignal(struct thread *td, struct proc *p, int signum); 789 853 struct pargs *pargs_alloc(int len); 790 /* Lock and unlock process arguments. 
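pfind() and the _PHOLD()/PRELE() pair above combine into the usual pattern for pinning another process, for example before procfs- or ptrace-style access to its U-area. A sketch; that pfind() returns with the proc lock already held is an assumption based on how its callers treat it:

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>

static int
hold_proc_by_pid(pid_t pid, struct proc **pp)
{
	struct proc *p;

	if ((p = pfind(pid)) == NULL)
		return (ESRCH);
	_PHOLD(p);		/* bump p_lock, fault the U-area in if needed */
	PROC_UNLOCK(p);		/* pfind() is assumed to return it locked */
	*pp = p;
	return (0);
}

The caller later drops the hold with PRELE(p).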
*/ 854 void pargs_drop(struct pargs *pa); 791 #define PARGS_LOCK(p) mtx_lock(&pargs_ref_lock) 855 void pargs_free(struct pargs *pa); 792 #define PARGS_UNLOCK(p) mtx_unlock(&pargs_ref_lock) 856 void pargs_hold(struct pargs *pa); 793 857 void procinit(void); 794 #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) 858 void threadinit(void); 795 extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; 859 void proc_linkup(struct proc *p, struct ksegrp *kg, 796 extern u_long pidhash; 860 struct kse *ke, struct thread *td); 797 861 void proc_reparent(struct proc *child, struct proc *newparent); 798 #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) 862 int securelevel_ge(struct ucred *cr, int level); 799 extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; 863 int securelevel_gt(struct ucred *cr, int level); 800 extern u_long pgrphash; 864 void setrunnable(struct thread *); 801 865 void setrunqueue(struct thread *); 11/15/03 15:57:19 sys/sys/proc.h 8 866 void setsugid(struct proc *p); 930 void upcall_stash(struct kse_upcall *ke); 867 int sigonstack(size_t sp); 931 void thread_sanity_check(struct thread *td, char *); 868 void sleepinit(void); 932 void thread_stopped(struct proc *p); 869 void stopevent(struct proc *, u_int, u_int); 933 void thread_switchout(struct thread *td); 870 void cpu_idle(void); 934 void thr_exit1(void); 871 extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */ 935 #endif /* _KERNEL */ 872 void cpu_switch(struct thread *old, struct thread *new); 936 873 void cpu_throw(struct thread *old, struct thread *new) __dead2; 937 #endif /* !_SYS_PROC_H_ */ 874 void unsleep(struct thread *); 875 void userret(struct thread *, struct trapframe *, u_int); 876 877 void cpu_exit(struct thread *); 878 void cpu_sched_exit(struct thread *); 879 void exit1(struct thread *, int) __dead2; 880 void cpu_fork(struct thread *, struct proc *, struct thread *, int); 881 void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); 882 883 /* New in KSE. 
*/ 884 struct ksegrp *ksegrp_alloc(void); 885 void ksegrp_free(struct ksegrp *kg); 886 void ksegrp_stash(struct ksegrp *kg); 887 struct kse *kse_alloc(void); 888 void kse_free(struct kse *ke); 889 void kse_stash(struct kse *ke); 890 void cpu_set_upcall(struct thread *td, struct thread *td0); 891 void cpu_set_upcall_kse(struct thread *td, struct kse_upcall *ku); 892 void cpu_thread_clean(struct thread *); 893 void cpu_thread_exit(struct thread *); 894 void cpu_thread_setup(struct thread *td); 895 void cpu_thread_siginfo(int sig, u_long code, siginfo_t *si); 896 void cpu_thread_swapin(struct thread *); 897 void cpu_thread_swapout(struct thread *); 898 void kse_reassign(struct kse *ke); 899 void kse_link(struct kse *ke, struct ksegrp *kg); 900 void kse_unlink(struct kse *ke); 901 void ksegrp_link(struct ksegrp *kg, struct proc *p); 902 void ksegrp_unlink(struct ksegrp *kg); 903 void thread_signal_add(struct thread *td, int sig); 904 struct thread *thread_alloc(void); 905 void thread_exit(void) __dead2; 906 int thread_export_context(struct thread *td, int willexit); 907 void thread_free(struct thread *td); 908 void thread_link(struct thread *td, struct ksegrp *kg); 909 void thread_reap(void); 910 struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku ); 911 int thread_single(int how); 912 #define SINGLE_NO_EXIT 0 /* values for ’how’ */ 913 #define SINGLE_EXIT 1 914 void thread_single_end(void); 915 void thread_stash(struct thread *td); 916 int thread_suspend_check(int how); 917 void thread_suspend_one(struct thread *td); 918 void thread_unlink(struct thread *td); 919 void thread_unsuspend(struct proc *p); 920 void thread_unsuspend_one(struct thread *td); 921 int thread_userret(struct thread *td, struct trapframe *frame); 922 void thread_user_enter(struct proc *p, struct thread *td); 923 void thread_wait(struct proc *p); 924 int thread_statclock(int user); 925 struct kse_upcall *upcall_alloc(void); 926 void upcall_free(struct kse_upcall *ku); 927 void upcall_link(struct kse_upcall *ku, struct ksegrp *kg); 928 void upcall_unlink(struct kse_upcall *ku); 929 void upcall_remove(struct thread *td); 11/12/03 07:07:18 sys/sys/ucred.h 1 1 /* 66 #endif /* _KERNEL || _WANT_UCRED */ 2 * Copyright (c) 1989, 1993 67 3 * The Regents of the University of California. All rights reserved. 68 /* 4 * 69 * This is the external representation of struct ucred. 5 * Redistribution and use in source and binary forms, with or without 70 */ 6 * modification, are permitted provided that the following conditions 71 struct xucred { 7 * are met: 72 u_int cr_version; /* structure layout version */ 8 * 1. Redistributions of source code must retain the above copyright 73 uid_t cr_uid; /* effective user id */ 9 * notice, this list of conditions and the following disclaimer. 74 short cr_ngroups; /* number of groups */ 10 * 2. Redistributions in binary form must reproduce the above copyright 75 gid_t cr_groups[NGROUPS]; /* groups */ 11 * notice, this list of conditions and the following disclaimer in the 76 void *_cr_unused1; /* compatibility with old ucred */ 12 * documentation and/or other materials provided with the distribution. 77 }; 13 * 3. All advertising materials mentioning features or use of this software 78 #define XUCRED_VERSION 0 14 * must display the following acknowledgement: 79 15 * This product includes software developed by the University of 80 /* This can be used for both ucred and xucred structures. */ 16 * California, Berkeley and its contributors. 81 #define cr_gid cr_groups[0] 17 * 4. 
Neither the name of the University nor the names of its contributors 82 18 * may be used to endorse or promote products derived from this software 83 #ifdef _KERNEL 19 * without specific prior written permission. 84 struct thread; 20 * 85 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 86 void change_egid(struct ucred *newcred, gid_t egid); 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 void change_euid(struct ucred *newcred, struct uidinfo *euip); 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 void change_rgid(struct ucred *newcred, gid_t rgid); 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 89 void change_ruid(struct ucred *newcred, struct uidinfo *ruip); 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 void change_svgid(struct ucred *newcred, gid_t svgid); 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 void change_svuid(struct ucred *newcred, uid_t svuid); 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 void crcopy(struct ucred *dest, struct ucred *src); 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 struct ucred *crdup(struct ucred *cr); 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 #ifdef DIAGNOSTIC 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 void cred_free_thread(struct thread *td); 31 * SUCH DAMAGE. 96 #endif 32 * 97 void cred_update_thread(struct thread *td); 33 * @(#)ucred.h 8.4 (Berkeley) 1/9/95 98 void crfree(struct ucred *cr); 34 * $FreeBSD: src/sys/sys/ucred.h,v 1.48 2003/11/12 15:07:18 bde Exp $ 99 struct ucred *crget(void); 35 */ 100 struct ucred *crhold(struct ucred *cr); 36 101 int crshared(struct ucred *cr); 37 #ifndef _SYS_UCRED_H_ 102 void cru2x(struct ucred *cr, struct xucred *xcr); 38 #define _SYS_UCRED_H_ 103 int groupmember(gid_t gid, struct ucred *cred); 39 104 #endif /* _KERNEL */ 40 /* 105 41 * Credentials. 106 #endif /* !_SYS_UCRED_H_ */ 42 * 43 * Please do not inspect cr_uid directly to determine superuserness. 44 * Only the suser() or suser_cred() function should be used for this. 45 */ 46 #if defined(_KERNEL) || defined(_WANT_UCRED) 47 struct ucred { 48 u_int cr_ref; /* reference count */ 49 #define cr_startcopy cr_uid 50 uid_t cr_uid; /* effective user id */ 51 uid_t cr_ruid; /* real user id */ 52 uid_t cr_svuid; /* saved user id */ 53 short cr_ngroups; /* number of groups */ 54 gid_t cr_groups[NGROUPS]; /* groups */ 55 gid_t cr_rgid; /* real group id */ 56 gid_t cr_svgid; /* saved user id */ 57 struct uidinfo *cr_uidinfo; /* per euid resource consumption */ 58 struct uidinfo *cr_ruidinfo; /* per ruid resource consumption */ 59 struct prison *cr_prison; /* jail(2) */ 60 #define cr_endcopy cr_label 61 struct label *cr_label; /* MAC label */ 62 struct mtx *cr_mtxp; /* protect refcount */ 63 }; 64 #define NOCRED ((struct ucred *)0) /* no credential available */ 65 #define FSCRED ((struct ucred *)-1) /* filesystem credential */ 05/13/03 13:36:02 sys/sys/user.h 1 1 /* 66 * 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 67 * When adding new fields to this structure, ALWAYS add them at the end 3 * The Regents of the University of California. All rights reserved. 68 * and decrease the size of the spare field by the amount of space that 4 * 69 * you are adding. 
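The credential functions above are meant to be used copy-on-write: allocate a fresh ucred, copy the old one, modify the copy, then swap it in and drop the displaced reference (compare the setuid()-family code in kern/kern_prot.c). A sketch for the effective gid; the helper name is illustrative:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/ucred.h>

static void
proc_set_egid(struct proc *p, gid_t egid)
{
	struct ucred *newcred, *oldcred;

	newcred = crget();		/* may sleep, so allocate before locking */
	PROC_LOCK(p);
	oldcred = p->p_ucred;
	crcopy(newcred, oldcred);	/* copies fields and takes new references */
	change_egid(newcred, egid);
	p->p_ucred = newcred;
	PROC_UNLOCK(p);
	crfree(oldcred);		/* release the reference we displaced */
}

Because threads cache a reference in td_ucred, readers never need the proc lock for their own credentials; only the swap above does.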
Byte aligned data should be added to the ki_sparestring 5 * Redistribution and use in source and binary forms, with or without 70 * space; other entries should be added to the ki_spare space. Always 6 * modification, are permitted provided that the following conditions 71 * verify that sizeof(struct kinfo_proc) == KINFO_PROC_SIZE when you are 7 * are met: 72 * done. If you change the size of this structure, many programs will stop 8 * 1. Redistributions of source code must retain the above copyright 73 * working! Once you have added the new field, you will need to add code 9 * notice, this list of conditions and the following disclaimer. 74 * to initialize it in two places: kern/kern_proc.c in the function 10 * 2. Redistributions in binary form must reproduce the above copyright 75 * fill_kinfo_proc and in lib/libkvm/kvm_proc.c in the function kvm_proclist. 11 * notice, this list of conditions and the following disclaimer in the 76 */ 12 * documentation and/or other materials provided with the distribution. 77 #if defined(__alpha__) || defined(__ia64__) || defined(__sparc64__) || \ 13 * 3. All advertising materials mentioning features or use of this software 78 defined(__amd64__) 14 * must display the following acknowledgement: 79 #define KINFO_PROC_SIZE 912 /* the correct size for kinfo_proc */ 15 * This product includes software developed by the University of 80 #endif 16 * California, Berkeley and its contributors. 81 #ifdef __i386__ 17 * 4. Neither the name of the University nor the names of its contributors 82 #define KINFO_PROC_SIZE 648 /* the correct size for kinfo_proc */ 18 * may be used to endorse or promote products derived from this software 83 #endif 19 * without specific prior written permission. 84 #ifdef __powerpc__ 20 * 85 #define KINFO_PROC_SIZE 656 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 86 #endif 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 #ifndef KINFO_PROC_SIZE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 #error "Unknown architecture" 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 89 #endif 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 #define WMESGLEN 8 /* size of returned wchan message */ 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 #define LOCKNAMELEN 8 /* size of returned lock name */ 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 #define OCOMMLEN 16 /* size of returned ki_ocomm name */ 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 #define COMMLEN 19 /* size of returned ki_comm name */ 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 #define KI_NGROUPS 16 /* number of groups in ki_groups */ 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 #define LOGNAMELEN 17 /* size of returned ki_login */ 31 * SUCH DAMAGE. 
96 32 * 97 struct kinfo_proc { 33 * @(#)user.h 8.2 (Berkeley) 9/23/93 98 int ki_structsize; /* size of this structure */ 34 * $FreeBSD: src/sys/sys/user.h,v 1.53 2003/05/13 20:36:02 jhb Exp $ 99 int ki_layout; /* reserved: layout identifier */ 35 */ 100 struct pargs *ki_args; /* address of command arguments */ 36 101 struct proc *ki_paddr; /* address of proc */ 37 #ifndef _SYS_USER_H_ 102 struct user *ki_addr; /* kernel virtual addr of u-area */ 38 #define _SYS_USER_H_ 103 struct vnode *ki_tracep; /* pointer to trace file */ 39 104 struct vnode *ki_textvp; /* pointer to executable file */ 40 #include 105 struct filedesc *ki_fd; /* pointer to open file info */ 41 #ifndef _KERNEL 106 struct vmspace *ki_vmspace; /* pointer to kernel vmspace struct */ 42 /* stuff that *used* to be included by user.h, or is now needed */ 107 void *ki_wchan; /* sleep address */ 43 #include 108 pid_t ki_pid; /* Process identifier */ 44 #include 109 pid_t ki_ppid; /* parent process id */ 45 #include 110 pid_t ki_pgid; /* process group id */ 46 #include 111 pid_t ki_tpgid; /* tty process group id */ 47 #include 112 pid_t ki_sid; /* Process session ID */ 48 #include 113 pid_t ki_tsid; /* Terminal session ID */ 49 #include 114 short ki_jobc; /* job control counter */ 50 #include 115 udev_t ki_tdev; /* controlling tty dev */ 51 #include 116 sigset_t ki_siglist; /* Signals arrived but not delivered * 52 #include /* XXX */ / 53 #include /* XXX */ 117 sigset_t ki_sigmask; /* Current signal mask */ 54 #include /* XXX */ 118 sigset_t ki_sigignore; /* Signals being ignored */ 55 #include /* XXX */ 119 sigset_t ki_sigcatch; /* Signals being caught by user */ 56 #endif /* !_KERNEL */ 120 uid_t ki_uid; /* effective user id */ 57 #ifndef _SYS_RESOURCEVAR_H_ 121 uid_t ki_ruid; /* Real user id */ 58 #include 122 uid_t ki_svuid; /* Saved effective user id */ 59 #endif 123 gid_t ki_rgid; /* Real group id */ 60 #ifndef _SYS_SIGNALVAR_H_ 124 gid_t ki_svgid; /* Saved effective group id */ 61 #include 125 short ki_ngroups; /* number of groups */ 62 #endif 126 gid_t ki_groups[KI_NGROUPS]; /* groups */ 63 127 vm_size_t ki_size; /* virtual size */ 64 /* 128 segsz_t ki_rssize; /* current resident set size in pages 65 * KERN_PROC subtype ops return arrays of selected proc structure entries: */ 05/13/03 13:36:02 sys/sys/user.h 2 129 segsz_t ki_swrss; /* resident set size before last swap */ 130 segsz_t ki_tsize; /* text size (pages) XXX */ 131 segsz_t ki_dsize; /* data size (pages) XXX */ 132 segsz_t ki_ssize; /* stack size (pages) */ 133 u_short ki_xstat; /* Exit status for wait & stop signal */ 134 u_short ki_acflag; /* Accounting flags */ 135 fixpt_t ki_pctcpu; /* %cpu for process during ki_swtime * / 136 u_int ki_estcpu; /* Time averaged value of ki_cpticks * / 137 u_int ki_slptime; /* Time since last blocked */ 138 u_int ki_swtime; /* Time swapped in or out */ 139 u_int64_t ki_runtime; /* Real time in microsec */ 140 struct timeval ki_start; /* starting time */ 141 struct timeval ki_childtime; /* time used by process children */ 142 long ki_flag; /* P_* flags */ 143 long ki_kiflag; /* KI_* flags (below) */ 144 int ki_traceflag; /* Kernel trace points */ 145 char ki_stat; /* S* process status */ 146 char ki_nice; /* Process "nice" value */ 147 char ki_lock; /* Process lock (prevent swap) count * / 148 char ki_rqindex; /* Run queue index */ 149 u_char ki_oncpu; /* Which cpu we are on */ 150 u_char ki_lastcpu; /* Last cpu we were on */ 151 char ki_ocomm[OCOMMLEN+1]; /* command name */ 152 char ki_wmesg[WMESGLEN+1]; /* wchan message */ 153 
char ki_login[LOGNAMELEN+1]; /* setlogin name */ 154 char ki_lockname[LOCKNAMELEN+1]; /* lock name */ 155 char ki_comm[COMMLEN+1]; /* command name */ 156 char ki_sparestrings[85]; /* spare string space */ 157 struct rusage ki_rusage; /* process rusage statistics */ 158 long ki_sflag; /* PS_* flags */ 159 struct priority ki_pri; /* process priority */ 160 long ki_tdflags; /* XXXKSE kthread flag */ 161 struct pcb *ki_pcb; /* kernel virtual addr of pcb */ 162 void *ki_kstack; /* kernel virtual addr of stack */ 163 long ki_spare[22]; /* spare constants */ 164 }; 165 void fill_kinfo_proc(struct proc *, struct kinfo_proc *); 166 167 /* ki_sessflag values */ 168 #define KI_CTTY 0x00000001 /* controlling tty vnode active */ 169 #define KI_SLEADER 0x00000002 /* session leader */ 170 #define KI_LOCKBLOCK 0x00000004 /* proc blocked on lock ki_lockname */ 171 172 /* 173 * Per process structure containing data that isn’t needed in core 174 * when the process isn’t running (esp. when swapped out). 175 */ 176 struct user { 177 struct pstats u_stats; /* *p_stats */ 178 /* 179 * Remaining field for a.out core dumps - not valid at other times! 180 */ 181 struct kinfo_proc u_kproc; /* eproc */ 182 }; 183 184 #endif 11/11/03 19:14:31 sys/sys/vnode.h 1 1 /* 66 2 * Copyright (c) 1989, 1993 67 /* 3 * The Regents of the University of California. All rights reserved. 68 * Each underlying filesystem allocates its own private area and hangs 4 * 69 * it from v_data. If non-null, this area is freed in getnewvnode(). 5 * Redistribution and use in source and binary forms, with or without 70 */ 6 * modification, are permitted provided that the following conditions 71 TAILQ_HEAD(buflists, buf); 7 * are met: 72 8 * 1. Redistributions of source code must retain the above copyright 73 typedef int vop_t(void *); 9 * notice, this list of conditions and the following disclaimer. 74 struct namecache; 10 * 2. Redistributions in binary form must reproduce the above copyright 75 11 * notice, this list of conditions and the following disclaimer in the 76 struct vpollinfo { 12 * documentation and/or other materials provided with the distribution. 77 struct mtx vpi_lock; /* lock to protect below */ 13 * 3. All advertising materials mentioning features or use of this software 78 struct selinfo vpi_selinfo; /* identity of poller(s) */ 14 * must display the following acknowledgement: 79 short vpi_events; /* what they are looking for */ 15 * This product includes software developed by the University of 80 short vpi_revents; /* what has happened */ 16 * California, Berkeley and its contributors. 81 }; 17 * 4. Neither the name of the University nor the names of its contributors 82 18 * may be used to endorse or promote products derived from this software 83 /* 19 * without specific prior written permission. 84 * Reading or writing any of these items requires holding the appropriate lock 20 * . 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 85 * 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 86 * Lock reference: 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 87 * c - namecache mutex 24 * ARE DISCLAIMED. 
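The layout note above struct kinfo_proc insists that sizeof(struct kinfo_proc) stay equal to KINFO_PROC_SIZE on every architecture, since ps(1), libkvm and the sysctl ABI depend on it. A compile-time check in the style of FreeBSD's CTASSERT() (from <sys/systm.h>) keeps that invariant honest; placing it in kern/kern_proc.c is an assumption:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>

/* Fails the build if the structure size drifts from the ABI constant. */
CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);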
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 88 * f - freelist mutex 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 89 * i - interlock 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 90 * m - mntvnodes mutex 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 91 * p - pollinfo lock 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 92 * s - spechash mutex 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 93 * S - syncer mutex 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 94 * u - Only a reference to the vnode is needed to read. 31 * SUCH DAMAGE. 95 * v - vnode lock 32 * 96 * 33 * @(#)vnode.h 8.7 (Berkeley) 2/4/94 97 * Vnodes may be found on many lists. The general way to deal with operating 34 * $FreeBSD: src/sys/sys/vnode.h,v 1.230 2003/11/12 03:14:31 rwatson Exp $ 98 * on a vnode that is on a list is: 35 */ 99 * 1) Lock the list and find the vnode. 36 100 * 2) Lock interlock so that the vnode does not go away. 37 #ifndef _SYS_VNODE_H_ 101 * 3) Unlock the list to avoid lock order reversals. 38 #define _SYS_VNODE_H_ 102 * 4) vget with LK_INTERLOCK and check for ENOENT, or 39 103 * 5) Check for XLOCK if the vnode lock is not required. 40 /* 104 * 6) Perform your operation, then vput(). 41 * XXX - compatability until lockmgr() goes away or all the #includes are 105 * 42 * updated. 106 * XXX Not all fields are locked yet and some fields that are marked are not 43 */ 107 * locked consistently. This is a work in progress. Requires Giant! 44 #include 108 */ 45 109 46 #include 110 struct vnode { 47 #include 111 struct mtx v_interlock; /* lock for "i" things */ 48 #include 112 u_long v_iflag; /* i vnode flags (see below) * 49 #include / 50 #include 113 int v_usecount; /* i ref count of users */ 51 #include 114 long v_numoutput; /* i writes in progress */ 52 #include 115 struct thread *v_vxproc; /* i thread owning VXLOCK */ 53 #include 116 int v_holdcnt; /* i page & buffer references 54 #include */ 55 117 struct buflists v_cleanblkhd; /* i SORTED clean blocklist */ 56 /* 118 struct buf *v_cleanblkroot; /* i clean buf splay tree */ 57 * The vnode is the focus of all file activity in UNIX. There is a 119 int v_cleanbufcnt; /* i number of clean buffers * 58 * unique vnode allocated for each active file, each current directory, / 59 * each mounted-on file, text file, and the root. 120 struct buflists v_dirtyblkhd; /* i SORTED dirty blocklist */ 60 */ 121 struct buf *v_dirtyblkroot; /* i dirty buf splay tree */ 61 122 int v_dirtybufcnt; /* i number of dirty buffers * 62 /* / 63 * Vnode types. VNON means no type. 
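The numbered protocol in the comment above (lock the list, take the interlock, drop the list lock, vget() with LK_INTERLOCK) reads more naturally as code. The loop below is an illustrative sketch only: mntvnode_mtx is assumed to be the mutex guarding mp->mnt_nvnodelist (the "m" lock in the legend), and the body is a placeholder.

	/* Illustrative: iterate a mount point's vnodes per the protocol above. */
	struct vnode *vp, *nvp;

	mtx_lock(&mntvnode_mtx);			/* 1) lock the list */
	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
		VI_LOCK(vp);				/* 2) pin it with the interlock */
		mtx_unlock(&mntvnode_mtx);		/* 3) avoid lock order reversal */
		/* 4) vget() with LK_INTERLOCK; it fails if the vnode is going away */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		/* 6) ... operate on the locked, referenced vnode ... */
		vput(vp);
		mtx_lock(&mntvnode_mtx);
	}
	mtx_unlock(&mntvnode_mtx);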
123 u_long v_vflag; /* v vnode flags */ 64 */ 124 int v_writecount; /* v ref count of writers */ 65 enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; 125 struct vm_object *v_object; /* v Place to store VM object 11/11/03 19:14:31 sys/sys/vnode.h 2 */ 179 u_long xv_id; /* capability identifier */ 126 daddr_t v_lastw; /* v last write (write cluster 180 void *xv_mount; /* address of parent mount */ ) */ 181 long xv_numoutput; /* num of writes in progress * 127 daddr_t v_cstart; /* v start block of cluster */ / 128 daddr_t v_lasta; /* v last allocation (cluster) 182 enum vtype xv_type; /* vnode type */ */ 183 union { 129 int v_clen; /* v length of current cluster 184 void *xvu_socket; /* socket, if VSOCK */ */ 185 void *xvu_fifo; /* fifo, if VFIFO */ 130 union { 186 udev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ 131 struct mount *vu_mountedhere;/* v ptr to mounted vfs (VDIR) 187 struct { */ 188 udev_t xvu_dev; /* device, if VDIR/VREG/VLNK * 132 struct socket *vu_socket; /* v unix ipc (VSOCK) */ / 133 struct { 189 ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ 134 struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ 190 } xv_uns; 135 SLIST_ENTRY(vnode) vu_specnext; /* s device aliases */ 191 } xv_un; 136 } vu_spec; 192 }; 137 struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ 193 #define xv_socket xv_un.xvu_socket 138 } v_un; 194 #define xv_fifo xv_un.xvu_fifo 139 TAILQ_ENTRY(vnode) v_freelist; /* f vnode freelist */ 195 #define xv_rdev xv_un.xvu_rdev 140 TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ 196 #define xv_dev xv_un.xv_uns.xvu_dev 141 LIST_ENTRY(vnode) v_synclist; /* S dirty vnode list */ 197 #define xv_ino xv_un.xv_uns.xvu_ino 142 enum vtype v_type; /* u vnode type */ 198 143 const char *v_tag; /* u type of underlying data * 199 #define VN_POLLEVENT(vp, events) \ / 200 do { \ 144 void *v_data; /* u private data for fs */ 201 if ((vp)->v_pollinfo != NULL && \ 145 struct lock v_lock; /* u used if fs don’t have one 202 (vp)->v_pollinfo->vpi_events & (events)) \ */ 203 vn_pollevent((vp), (events)); \ 146 struct lock *v_vnlock; /* u pointer to vnode lock */ 204 } while (0) 147 vop_t **v_op; /* u vnode operations vector * 205 / 206 #define VN_KNOTE(vp, b) \ 148 struct mount *v_mount; /* u ptr to vfs we are in */ 207 do { \ 149 LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ 208 if ((vp)->v_pollinfo != NULL) \ 150 TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ 209 KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b)); \ 151 u_long v_id; /* c capability identifier */ 210 } while (0) 152 struct vnode *v_dd; /* c .. vnode */ 211 153 u_long v_ddid; /* c .. capability identifier 212 /* */ 213 * Vnode flags. 
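The VN_POLLEVENT()/VN_KNOTE() macros defined here are how filesystems publish changes to poll(2) and kqueue(2) watchers. A typical, illustrative call site after a write, where "extended" is a local flag in the hypothetical caller indicating the file grew:

	/* Illustrative: wake kqueue watchers after a write that grew the file. */
	VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	/* An attribute-only change would instead post NOTE_ATTRIB. */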
154 struct vpollinfo *v_pollinfo; /* p Poll events */ 214 * VI flags are protected by interlock and live in v_iflag 155 struct label *v_label; /* MAC label for vnode */ 215 * VV flags are protected by the vnode lock and live in v_vflag 156 #ifdef DEBUG_LOCKS 216 */ 157 const char *filename; /* Source file doing locking * 217 #define VI_XLOCK 0x0001 /* vnode is locked to change vtype */ / 218 #define VI_XWANT 0x0002 /* thread is waiting for vnode */ 158 int line; /* Line number doing locking * 219 #define VI_BWAIT 0x0004 /* waiting for output to complete */ / 220 #define VI_OLOCK 0x0008 /* vnode is locked waiting for an object */ 159 #endif 221 #define VI_OWANT 0x0010 /* a thread is waiting for VOLOCK */ 160 udev_t v_cachedfs; /* cached fs id */ 222 #define VI_MOUNT 0x0020 /* Mount in progress */ 161 ino_t v_cachedid; /* cached file id */ 223 #define VI_AGE 0x0040 /* Insert vnode at head of free list */ 162 }; 224 #define VI_DOOMED 0x0080 /* This vnode is being recycled */ 163 #define v_mountedhere v_un.vu_mountedhere 225 #define VI_FREE 0x0100 /* This vnode is on the freelist */ 164 #define v_socket v_un.vu_socket 226 #define VI_OBJDIRTY 0x0400 /* object might be dirty */ 165 #define v_rdev v_un.vu_spec.vu_cdev 227 #define VI_DOINGINACT 0x0800 /* VOP_INACTIVE is in progress */ 166 #define v_specnext v_un.vu_spec.vu_specnext 228 /* 167 #define v_fifoinfo v_un.vu_fifoinfo 229 * XXX VI_ONWORKLST could be replaced with a check for NULL list elements 168 230 * in v_synclist. 169 /* 231 */ 170 * Userland version of struct vnode, for sysctl. 232 #define VI_ONWORKLST 0x0200 /* On syncer work-list */ 171 */ 233 172 struct xvnode { 234 #define VV_ROOT 0x0001 /* root of its filesystem */ 173 size_t xv_size; /* sizeof(struct xvnode) */ 235 #define VV_ISTTY 0x0002 /* vnode represents a tty */ 174 void *xv_vnode; /* address of real vnode */ 236 #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ 175 u_long xv_flag; /* vnode vflags */ 237 #define VV_OBJBUF 0x0008 /* Allocate buffers in VM object */ 176 int xv_usecount; /* reference count of users */ 238 #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ 177 int xv_writecount; /* reference count of writers 239 #define VV_TEXT 0x0020 /* vnode is a pure text prototype */ */ 240 #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ 178 int xv_holdcnt; /* page & buffer references */ 241 #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ 11/11/03 19:14:31 sys/sys/vnode.h 3 242 #define VV_PROCDEP 0x0100 /* vnode is process dependent */ 306 #define VADMIN 010000 /* permission to administer */ 243 307 #define VSTAT 020000 /* permission to retrieve attrs */ 244 /* 308 #define VAPPEND 040000 /* permission to write/append */ 245 * Vnode attributes. A field value of VNOVAL represents a field whose value 309 #define VALLPERM (VEXEC | VWRITE | VREAD | VADMIN | VSTAT | VAPPEND) 246 * is unavailable (getattr) or which is not to be changed (setattr). 310 247 */ 311 /* 248 struct vattr { 312 * Token indicating no attribute value yet assigned. 
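Because VI_* bits live in v_iflag under the interlock while VV_* bits live in v_vflag under the vnode lock, each update must hold the matching lock. An illustrative fragment (the particular bits chosen are arbitrary):

	/* Illustrative: VI_* bits need the interlock, VV_* bits the vnode lock. */
	VI_LOCK(vp);
	vp->v_iflag |= VI_OBJDIRTY;		/* interlock-protected bit */
	VI_UNLOCK(vp);

	ASSERT_VOP_LOCKED(vp, "set VV_TEXT");	/* caller already holds the vnode lock */
	vp->v_vflag |= VV_TEXT;			/* vnode-lock-protected bit */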
249 enum vtype va_type; /* vnode type (for create) */ 313 */ 250 u_short va_mode; /* files access mode and type */ 314 #define VNOVAL (-1) 251 short va_nlink; /* number of references to file */ 315 252 uid_t va_uid; /* owner user id */ 316 /* 253 gid_t va_gid; /* owner group id */ 317 * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) 254 udev_t va_fsid; /* filesystem id */ 318 */ 255 long va_fileid; /* file id */ 319 #define VLKTIMEOUT (hz / 20 + 1) 256 u_quad_t va_size; /* file size in bytes */ 320 257 long va_blocksize; /* blocksize preferred for i/o */ 321 #ifdef _KERNEL 258 struct timespec va_atime; /* time of last access */ 322 259 struct timespec va_mtime; /* time of last modification */ 323 #ifdef MALLOC_DECLARE 260 struct timespec va_ctime; /* time file changed */ 324 MALLOC_DECLARE(M_VNODE); 261 struct timespec va_birthtime; /* time file created */ 325 #endif 262 u_long va_gen; /* generation number of file */ 326 263 u_long va_flags; /* flags defined for file */ 327 /* 264 udev_t va_rdev; /* device the special file represents 328 * Convert between vnode types and inode formats (since POSIX.1 */ 329 * defines mode word of stat structure in terms of inode formats). 265 u_quad_t va_bytes; /* bytes of disk space held by file */ 330 */ 266 u_quad_t va_filerev; /* file modification number */ 331 extern enum vtype iftovt_tab[]; 267 u_int va_vaflags; /* operations flags, see below */ 332 extern int vttoif_tab[]; 268 long va_spare; /* remain quad aligned */ 333 #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) 269 }; 334 #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) 270 335 #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) 271 /* 336 272 * Flags for va_vaflags. 337 /* 273 */ 338 * Flags to various vnode functions. 274 #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ 339 */ 275 #define VA_EXCLUSIVE 0x02 /* exclusive create request */ 340 #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ 276 341 #define FORCECLOSE 0x0002 /* vflush: force file closure */ 277 /* 342 #define WRITECLOSE 0x0004 /* vflush: only close writable files */ 278 * Flags for ioflag. 
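The iftovt_tab[]/vttoif_tab[] conversions referenced above are used when a filesystem moves between on-disk inode mode words and enum vtype. A sketch, where ip and vap stand for a hypothetical filesystem's in-core inode and a caller-supplied vattr:

	/* Illustrative: map an on-disk mode word to a vnode type and back. */
	vp->v_type = IFTOVT(ip->i_mode);		/* e.g. S_IFDIR -> VDIR */
	ip->i_mode = MAKEIMODE(vap->va_type, vap->va_mode); /* vtype + perm bits */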
(high 16 bits used to ask for read-ahead and 343 #define DOCLOSE 0x0008 /* vclean: close active files */ 279 * help with write clustering) 344 #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ 280 */ 345 #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs * 281 #define IO_UNIT 0x0001 /* do I/O as atomic unit */ / 282 #define IO_APPEND 0x0002 /* append write to end */ 346 #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ 283 #define IO_SYNC 0x0004 /* do I/O synchronously */ 347 #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ 284 #define IO_NODELOCKED 0x0008 /* underlying node already locked */ 348 #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ 285 #define IO_NDELAY 0x0010 /* FNDELAY flag set in file table */ 349 #define V_NOWAIT 0x0002 /* vn_start_write: don’t sleep for suspend */ 286 #define IO_VMIO 0x0020 /* data already in VMIO space */ 350 #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ 287 #define IO_INVAL 0x0040 /* invalidate after I/O */ 351 288 #define IO_ASYNC 0x0080 /* bawrite rather then bdwrite */ 352 #define VREF(vp) vref(vp) 289 #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ 353 290 #define IO_EXT 0x0400 /* operate on external attributes */ 354 291 #define IO_NORMAL 0x0800 /* operate on regular data */ 355 #ifdef DIAGNOSTIC 292 #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ 356 #define VATTR_NULL(vap) vattr_null(vap) 293 357 #else 294 #define IO_SEQMAX 0x7F /* seq heuristic max value */ 358 #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ 295 #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ 359 #endif /* DIAGNOSTIC */ 296 360 297 /* 361 #define NULLVP ((struct vnode *)NULL) 298 * Modes. Some values same as Ixxx entries from inode.h for now. 362 299 */ 363 #define VNODEOP_SET(f) \ 300 #define VEXEC 000100 /* execute/search permission */ 364 C_SYSINIT(f##init, SI_SUB_VFS, SI_ORDER_SECOND, vfs_add_vnodeops, &f); 301 #define VWRITE 000200 /* write permission */ \ 302 #define VREAD 000400 /* read permission */ 365 C_SYSUNINIT(f##uninit, SI_SUB_VFS, SI_ORDER_SECOND, vfs_rm_vnodeops, & 303 #define VSVTX 001000 /* save swapped text even after use */ f); 304 #define VSGID 002000 /* set group id on execution */ 366 305 #define VSUID 004000 /* set user id on execution */ 367 /* 11/11/03 19:14:31 sys/sys/vnode.h 4 368 * Global vnode data. 432 369 */ 433 /* 370 extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ 434 * This structure describes the vnode operation taking place. 371 extern int async_io_version; /* 0 or POSIX version of AIO i’face */ 435 */ 372 extern int desiredvnodes; /* number of vnodes desired */ 436 struct vnodeop_desc { 373 extern struct uma_zone *namei_zone; 437 int vdesc_offset; /* offset in vector,first for speed */ 374 extern int prtactive; /* nonzero to call vprint() */ 438 char *vdesc_name; /* a readable name for debugging */ 375 extern struct vattr va_null; /* predefined null vattr structure */ 439 int vdesc_flags; /* VDESC_* flags */ 376 440 377 /* 441 /* 378 * Macro/function to check for client cache inconsistency w.r.t. leasing. 442 * These ops are used by bypass routines to map and locate arguments. 379 */ 443 * Creds and procs are not needed in bypass routines, but sometimes 380 #define LEASE_READ 0x1 /* Check lease for readers */ 444 * they are useful to (for example) transport layers. 381 #define LEASE_WRITE 0x2 /* Check lease for modifiers */ 445 * Nameidata is useful because it has a cred in it. 
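VATTR_NULL() resets every vattr field to VNOVAL ("do not change"), so a subsequent VOP_SETATTR() touches only the fields the caller fills in. Truncating a file to zero length is the canonical pattern (sketch, error handling elided):

	struct vattr va;

	VATTR_NULL(&va);		/* every field becomes VNOVAL */
	va.va_size = 0;			/* ...except the one we want to change */
	error = VOP_SETATTR(vp, &va, cred, td);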
382 446 */ 383 447 int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ 384 extern void (*lease_updatetime)(int deltat); 448 int vdesc_vpp_offset; /* return vpp location */ 385 449 int vdesc_cred_offset; /* cred location, if any */ 386 /* Requires interlock */ 450 int vdesc_thread_offset; /* thread location, if any */ 387 #define VSHOULDFREE(vp) \ 451 int vdesc_componentname_offset; /* if any */ 388 (!((vp)->v_iflag & (VI_FREE|VI_DOOMED|VI_DOINGINACT)) && \ 452 /* 389 !(vp)->v_holdcnt && !(vp)->v_usecount && \ 453 * Finally, we’ve got a list of private data (about each operation) 390 (!(vp)->v_object || \ 454 * for each transport layer. (Support to manage this list is not 391 !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count)) 455 * yet part of BSD.) ) 456 */ 392 457 caddr_t *vdesc_transports; 393 /* Requires interlock */ 458 }; 394 #define VMIGHTFREE(vp) \ 459 395 (!((vp)->v_iflag & (VI_FREE|VI_DOOMED|VI_XLOCK|VI_DOINGINACT)) && \ 460 #ifdef _KERNEL 396 LIST_EMPTY(&(vp)->v_cache_src) && !(vp)->v_usecount) 461 /* 397 462 * A list of all the operation descs. 398 /* Requires interlock */ 463 */ 399 #define VSHOULDBUSY(vp) \ 464 extern struct vnodeop_desc *vnodeop_descs[]; 400 (((vp)->v_iflag & VI_FREE) && \ 465 401 ((vp)->v_holdcnt || (vp)->v_usecount)) 466 #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) 402 467 #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ 403 #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) 468 ((s_type)(((char*)(struct_p)) + (s_offset))) 404 #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) 469 405 #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) 470 /* 406 #define VI_MTX(vp) (&(vp)->v_interlock) 471 * This structure is used to configure the new vnodeops vector. 407 472 */ 408 #endif /* _KERNEL */ 473 struct vnodeopv_entry_desc { 409 474 struct vnodeop_desc *opve_op; /* which operation this is */ 410 475 vop_t *opve_impl; /* code implementing this operation */ 411 /* 476 }; 412 * Mods for extensibility. 477 struct vnodeopv_desc { 413 */ 478 /* ptr to the ptr to the vector where op should go */ 414 479 vop_t ***opv_desc_vector_p; 415 /* 480 struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ 416 * Flags for vdesc_flags: 481 }; 417 */ 482 418 #define VDESC_MAX_VPS 16 483 /* 419 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ 484 * A generic structure. 420 #define VDESC_VP0_WILLRELE 0x0001 485 * This can be used by bypass routines to identify generic arguments. 421 #define VDESC_VP1_WILLRELE 0x0002 486 */ 422 #define VDESC_VP2_WILLRELE 0x0004 487 struct vop_generic_args { 423 #define VDESC_VP3_WILLRELE 0x0008 488 struct vnodeop_desc *a_desc; 424 #define VDESC_NOMAP_VPP 0x0100 489 /* other random data follows, presumably */ 425 #define VDESC_VPP_WILLRELE 0x0200 490 }; 426 491 427 /* 492 /* 428 * VDESC_NO_OFFSET is used to identify the end of the offset list 493 * Support code to aid in debugging VFS locking problems. Not totally 429 * and in places where no such field exists. 494 * reliable since if the thread sleeps between changing the lock 430 */ 495 * state and checking it with the assert, some other thread could 431 #define VDESC_NO_OFFSET -1 496 * change the state. They are good enough for debugging a single 11/11/03 19:14:31 sys/sys/vnode.h 5 497 * filesystem using a single-threaded test. 
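VNODEOP_SET() registers a filesystem's operations vector at boot through SYSINIT. A hedged sketch for a hypothetical "myfs" follows; a real filesystem lists every vop it implements and terminates the table with a NULL entry:

	/* Hypothetical filesystem: build and register its vnode operations vector. */
	vop_t **myfs_vnodeop_p;
	static struct vnodeopv_entry_desc myfs_vnodeop_entries[] = {
		{ &vop_default_desc,	(vop_t *) vop_defaultop },
		{ &vop_lookup_desc,	(vop_t *) myfs_lookup },
		{ &vop_getattr_desc,	(vop_t *) myfs_getattr },
		{ &vop_reclaim_desc,	(vop_t *) myfs_reclaim },
		{ NULL, NULL }
	};
	static struct vnodeopv_desc myfs_vnodeop_opv_desc =
		{ &myfs_vnodeop_p, myfs_vnodeop_entries };

	VNODEOP_SET(myfs_vnodeop_opv_desc);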
562 if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) 498 */ 563 return(TRUE); 499 void assert_vi_locked(struct vnode *vp, const char *str); 564 return(FALSE); 500 void assert_vi_unlocked(struct vnode *vp, const char *str); 565 } 501 void assert_vop_unlocked(struct vnode *vp, const char *str); 566 502 void assert_vop_locked(struct vnode *vp, const char *str); 567 /* 503 void assert_vop_slocked(struct vnode *vp, const char *str); 568 * Finally, include the default set of vnode operations. 504 void assert_vop_elocked(struct vnode *vp, const char *str); 569 */ 505 void assert_vop_elocked_other(struct vnode *vp, const char *str); 570 #include "vnode_if.h" 506 571 507 /* These are called from within the actuall VOPS */ 572 /* 508 void vop_rename_pre(void *a); 573 * Public vnode manipulation functions. 509 void vop_strategy_pre(void *a); 574 */ 510 void vop_lookup_pre(void *a); 575 struct componentname; 511 void vop_lookup_post(void *a, int rc); 576 struct file; 512 void vop_lock_pre(void *a); 577 struct mount; 513 void vop_lock_post(void *a, int rc); 578 struct nameidata; 514 void vop_unlock_pre(void *a); 579 struct ostat; 515 void vop_unlock_post(void *a, int rc); 580 struct thread; 516 581 struct proc; 517 #ifdef DEBUG_VFS_LOCKS 582 struct stat; 518 583 struct nstat; 519 #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) 584 struct ucred; 520 #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) 585 struct uio; 521 #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) 586 struct vattr; 522 #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) 587 struct vnode; 523 #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) 588 524 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_locked_other((vp), (str)) 589 extern int (*lease_check_hook)(struct vop_lease_args *); 525 #define ASSERT_VOP_SLOCKED(vp, str) assert_vop_slocked((vp), (str)) 590 extern int (*softdep_fsync_hook)(struct vnode *); 526 591 extern int (*softdep_process_worklist_hook)(struct mount *); 527 #else 592 528 593 struct vnode *addaliasu(struct vnode *vp, udev_t nvp_rdev); 529 #define ASSERT_VOP_LOCKED(vp, str) 594 int bdevvp(dev_t dev, struct vnode **vpp); 530 #define ASSERT_VOP_UNLOCKED(vp, str) 595 /* cache_* may belong in namei.h. */ 531 #define ASSERT_VOP_ELOCKED(vp, str) 596 void cache_enter(struct vnode *dvp, struct vnode *vp, 532 #define ASSERT_VOP_ELOCKED_OTHER(vp, str) 597 struct componentname *cnp); 533 #define ASSERT_VOP_SLOCKED(vp, str) 598 int cache_lookup(struct vnode *dvp, struct vnode **vpp, 534 #define ASSERT_VI_UNLOCKED(vp, str) 599 struct componentname *cnp); 535 #define ASSERT_VI_LOCKED(vp, str) 600 void cache_purge(struct vnode *vp); 536 601 void cache_purgevfs(struct mount *mp); 537 #endif 602 int cache_leaf_test(struct vnode *vp); 538 603 int change_dir(struct vnode *vp, struct thread *td); 539 /* 604 int change_root(struct vnode *vp, struct thread *td); 540 * VOCALL calls an op given an ops vector. We break it out because BSD’s 605 void cvtstat(struct stat *st, struct ostat *ost); 541 * vclean changes the ops vector and then wants to call ops with the old 606 void cvtnstat(struct stat *sb, struct nstat *nsb); 542 * vector. 
607 int getnewvnode(const char *tag, struct mount *mp, vop_t **vops, 543 */ 608 struct vnode **vpp); 544 #define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) 609 int lease_check(struct vop_lease_args *ap); 545 610 int spec_vnoperate(struct vop_generic_args *); 546 /* 611 int speedup_syncer(void); 547 * This call works for vnodes in the kernel. 612 #define textvp_fullpath(p, rb, rfb) \ 548 */ 613 vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb) 549 #define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) 614 int vn_fullpath(struct thread *td, struct vnode *vn, 550 #define VDESC(OP) (& __CONCAT(OP,_desc)) 615 char **retbuf, char **freebuf); 551 #define VOFFSET(OP) (VDESC(OP)->vdesc_offset) 616 int vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid, 552 617 mode_t acc_mode, struct ucred *cred, int *privused); 553 /* 618 int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, 554 * VMIO support inline 619 gid_t file_gid, struct acl *acl, mode_t acc_mode, 555 */ 620 struct ucred *cred, int *privused); 556 621 void vattr_null(struct vattr *vap); 557 extern int vmiodirenable; 622 int vcount(struct vnode *vp); 558 623 void vdrop(struct vnode *); 559 static __inline int 624 void vdropl(struct vnode *); 560 vn_canvmio(struct vnode *vp) 625 int vfinddev(dev_t dev, enum vtype type, struct vnode **vpp); 561 { 626 void vfs_add_vnodeops(const void *); 11/11/03 19:14:31 sys/sys/vnode.h 6 627 void vfs_rm_vnodeops(const void *); 692 int vop_stdunlock(struct vop_unlock_args *); 628 int vflush(struct mount *mp, int rootrefs, int flags); 693 int vop_noislocked(struct vop_islocked_args *); 629 int vget(struct vnode *vp, int lockflag, struct thread *td); 694 int vop_nolock(struct vop_lock_args *); 630 void vgone(struct vnode *vp); 695 int vop_nopoll(struct vop_poll_args *); 631 void vgonel(struct vnode *vp, struct thread *td); 696 int vop_nounlock(struct vop_unlock_args *); 632 void vhold(struct vnode *); 697 int vop_stdpathconf(struct vop_pathconf_args *); 633 void vholdl(struct vnode *); 698 int vop_stdpoll(struct vop_poll_args *); 634 int vinvalbuf(struct vnode *vp, int save, struct ucred *cred, 699 int vop_revoke(struct vop_revoke_args *); 635 struct thread *td, int slpflag, int slptimeo); 700 int vop_sharedlock(struct vop_lock_args *); 636 int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, 701 int vop_eopnotsupp(struct vop_generic_args *ap); 637 off_t length, int blksize); 702 int vop_ebadf(struct vop_generic_args *ap); 638 void vprint(char *label, struct vnode *vp); 703 int vop_einval(struct vop_generic_args *ap); 639 int vrecycle(struct vnode *vp, struct mtx *inter_lkp, 704 int vop_enotty(struct vop_generic_args *ap); 640 struct thread *td); 705 int vop_defaultop(struct vop_generic_args *ap); 641 int vn_close(struct vnode *vp, 706 int vop_null(struct vop_generic_args *ap); 642 int flags, struct ucred *file_cred, struct thread *td); 707 int vop_panic(struct vop_generic_args *ap); 643 void vn_finished_write(struct mount *mp); 708 int vop_stdcreatevobject(struct vop_createvobject_args *ap); 644 int vn_isdisk(struct vnode *vp, int *errp); 709 int vop_stddestroyvobject(struct vop_destroyvobject_args *ap); 645 int vn_lock(struct vnode *vp, int flags, struct thread *td); 710 int vop_stdgetvobject(struct vop_getvobject_args *ap); 646 #ifdef DEBUG_LOCKS 711 647 int debug_vn_lock(struct vnode *vp, int flags, struct thread *p, 712 void vfree(struct vnode *); 648 const char *filename, int line); 713 void vput(struct vnode *vp); 649 #define vn_lock(vp,flags,p) 
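VOCALL/VCALL/VDESC/VOFFSET are the plumbing behind the VOP_*() wrappers that vnode_if.awk generates from vnode_if.src. Stripped of its debugging hooks, a generated wrapper looks roughly like the following sketch (not the literal generated file):

	/* Roughly what vnode_if.awk generates for VOP_GETATTR (debug code omitted). */
	static __inline int
	VOP_GETATTR(struct vnode *vp, struct vattr *vap, struct ucred *cred,
	    struct thread *td)
	{
		struct vop_getattr_args a;

		a.a_desc = VDESC(vop_getattr);
		a.a_vp = vp;
		a.a_vap = vap;
		a.a_cred = cred;
		a.a_td = td;
		return (VCALL(vp, VOFFSET(vop_getattr), &a));
	}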
debug_vn_lock(vp,flags,p,__FILE__,__LINE__) 714 void vrele(struct vnode *vp); 650 #endif 715 void vref(struct vnode *vp); 651 int vn_open(struct nameidata *ndp, int *flagp, int cmode, int fdidx); 716 int vrefcnt(struct vnode *vp); 652 int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, 717 void vbusy(struct vnode *vp); 653 struct ucred *cred, int fdidx); 718 void v_addpollinfo(struct vnode *vp); 654 void vn_pollevent(struct vnode *vp, int events); 719 655 void vn_pollgone(struct vnode *vp); 720 extern vop_t **default_vnodeop_p; 656 int vn_pollrecord(struct vnode *vp, struct thread *p, int events); 721 extern vop_t **spec_vnodeop_p; 657 int vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, 722 extern vop_t **dead_vnodeop_p; 658 int len, off_t offset, enum uio_seg segflg, int ioflg, 723 659 struct ucred *active_cred, struct ucred *file_cred, int *aresid, 724 #endif /* _KERNEL */ 660 struct thread *td); 725 661 int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, 726 #endif /* !_SYS_VNODE_H_ */ 662 int len, off_t offset, enum uio_seg segflg, int ioflg, 663 struct ucred *active_cred, struct ucred *file_cred, int *aresid, 664 struct thread *td); 665 int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, 666 struct ucred *file_cred, struct thread *td); 667 int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); 668 dev_t vn_todev(struct vnode *vp); 669 int vn_write_suspend_wait(struct vnode *vp, struct mount *mp, 670 int flags); 671 int vn_writechk(struct vnode *vp); 672 int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, 673 const char *attrname, int *buflen, char *buf, struct thread *td); 674 int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, 675 const char *attrname, int buflen, char *buf, struct thread *td); 676 int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, 677 const char *attrname, struct thread *td); 678 int vfs_cache_lookup(struct vop_lookup_args *ap); 679 int vfs_object_create(struct vnode *vp, struct thread *td, 680 struct ucred *cred); 681 void vfs_timestamp(struct timespec *); 682 void vfs_write_resume(struct mount *mp); 683 int vfs_write_suspend(struct mount *mp); 684 int vop_stdbmap(struct vop_bmap_args *); 685 int vop_stdfsync(struct vop_fsync_args *); 686 int vop_stdgetwritemount(struct vop_getwritemount_args *); 687 int vop_stdgetpages(struct vop_getpages_args *); 688 int vop_stdinactive(struct vop_inactive_args *); 689 int vop_stdislocked(struct vop_islocked_args *); 690 int vop_stdlock(struct vop_lock_args *); 691 int vop_stdputpages(struct vop_putpages_args *); 07/28/03 11:53:28 sys/kern/vnode_if.src 1 1 # 66 vop_islocked { 2 # Copyright (c) 1992, 1993 67 IN struct vnode *vp; 3 # The Regents of the University of California. All rights reserved. 68 IN struct thread *td; 4 # 69 }; 5 # Redistribution and use in source and binary forms, with or without 70 6 # modification, are permitted provided that the following conditions 71 # 7 # are met: 72 # lookup dvp L ? ? 8 # 1. Redistributions of source code must retain the above copyright 73 # lookup vpp - L - 9 # notice, this list of conditions and the following disclaimer. 74 #! lookup pre vop_lookup_pre 10 # 2. Redistributions in binary form must reproduce the above copyright 75 #! lookup post vop_lookup_post 11 # notice, this list of conditions and the following disclaimer in the 76 # 12 # documentation and/or other materials provided with the distribution. 
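vn_rdwr(), declared above, packages the uio setup and the VOP_READ()/VOP_WRITE() call for in-kernel I/O on a vnode. An illustrative fragment that reads len bytes at offset off from an already-locked vnode:

	/* Illustrative: read len bytes at off from an already-locked vnode. */
	int resid;

	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, len, off, UIO_SYSSPACE,
	    IO_NODELOCKED, cred, NOCRED, &resid, td);
	if (error == 0 && resid != 0)
		error = EIO;		/* short read */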
77 # XXX - the lookup locking protocol defies simple description and depends 13 # 3. All advertising materials mentioning features or use of this software 78 # on the flags and operation fields in the (cnp) structure. Note 14 # must display the following acknowledgement: 79 # especially that *vpp may equal dvp and both may be locked. 15 # This product includes software developed by the University of 80 # 16 # California, Berkeley and its contributors. 81 vop_lookup { 17 # 4. Neither the name of the University nor the names of its contributors 82 IN struct vnode *dvp; 18 # may be used to endorse or promote products derived from this software 83 INOUT struct vnode **vpp; 19 # without specific prior written permission. 84 IN struct componentname *cnp; 20 # 85 }; 21 # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 86 22 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 # 23 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 #% cachedlookup dvp L ? ? 24 # ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 89 #% cachedlookup vpp - L - 25 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 # 26 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 # This must be an exact copy of lookup. See kern/vfs_cache.c for details. 27 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 # 28 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 vop_cachedlookup { 29 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 IN struct vnode *dvp; 30 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 INOUT struct vnode **vpp; 31 # SUCH DAMAGE. 96 IN struct componentname *cnp; 32 # 97 }; 33 # @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 98 34 # $FreeBSD: src/sys/kern/vnode_if.src,v 1.66 2003/07/28 18:53:28 rwatson Exp $ 99 # 35 # 100 #% create dvp L L L 36 101 #% create vpp - L - 37 # 102 # 38 # Above each of the vop descriptors is a specification of the locking 103 vop_create { 39 # protocol used by each vop call. The first column is the name of 104 IN struct vnode *dvp; 40 # the variable, the remaining three columns are in, out and error 105 OUT struct vnode **vpp; 41 # respectively. The "in" column defines the lock state on input, 106 IN struct componentname *cnp; 42 # the "out" column defines the state on succesful return, and the 107 IN struct vattr *vap; 43 # "error" column defines the locking state on error exit. 108 }; 44 # 109 45 # The locking value can take the following values: 110 # 46 # L: locked; not converted to type of lock. 111 #% whiteout dvp L L L 47 # A: any lock type. 112 # 48 # S: locked with shared lock. 113 vop_whiteout { 49 # E: locked with exclusive lock for this process. 114 IN struct vnode *dvp; 50 # O: locked with exclusive lock for other process. 115 IN struct componentname *cnp; 51 # U: unlocked. 116 IN int flags; 52 # -: not applicable. vnode does not yet (or no longer) exists. 117 }; 53 # =: the same on input and output, may be either L or U. 118 54 # X: locked if not nil. 119 # 55 # 120 #% mknod dvp L L L 56 # The paramater named "vpp" is assumed to be always used with double 121 #% mknod vpp - L - 57 # indirection (**vpp) and that name is hard-codeed in vnode_if.awk ! 
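Each vop_xxx block in this .src file is turned by vnode_if.awk into a vop_xxx_args structure plus an inline wrapper. For vop_lookup the generated argument structure is roughly:

	/* Roughly the structure vnode_if.awk generates for vop_lookup. */
	struct vop_lookup_args {
		struct vnodeop_desc *a_desc;	/* always first */
		struct vnode *a_dvp;		/* IN */
		struct vnode **a_vpp;		/* INOUT */
		struct componentname *a_cnp;	/* IN */
	};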
122 # 58 # 123 vop_mknod { 59 # If other such parameters are introduced, they have to be added to 124 IN struct vnode *dvp; 60 # the AWK script at the head of the definition of "add_debug_code()". 125 OUT struct vnode **vpp; 61 # 126 IN struct componentname *cnp; 62 127 IN struct vattr *vap; 63 # 128 }; 64 # islocked vp = = = 129 65 # 130 # 07/28/03 11:53:28 sys/kern/vnode_if.src 2 131 #% open vp L L L 196 INOUT struct uio *uio; 132 # 197 IN int ioflag; 133 vop_open { 198 IN struct ucred *cred; 134 IN struct vnode *vp; 199 }; 135 IN int mode; 200 136 IN struct ucred *cred; 201 # 137 IN struct thread *td; 202 #% lease vp = = = 138 IN int fdidx; 203 # 139 }; 204 vop_lease { 140 205 IN struct vnode *vp; 141 # 206 IN struct thread *td; 142 #% close vp U U U 207 IN struct ucred *cred; 143 # 208 IN int flag; 144 vop_close { 209 }; 145 IN struct vnode *vp; 210 146 IN int fflag; 211 # 147 IN struct ucred *cred; 212 #% ioctl vp U U U 148 IN struct thread *td; 213 # 149 }; 214 vop_ioctl { 150 215 IN struct vnode *vp; 151 # 216 IN u_long command; 152 #% access vp L L L 217 IN caddr_t data; 153 # 218 IN int fflag; 154 vop_access { 219 IN struct ucred *cred; 155 IN struct vnode *vp; 220 IN struct thread *td; 156 IN int mode; 221 }; 157 IN struct ucred *cred; 222 158 IN struct thread *td; 223 # 159 }; 224 #% poll vp U U U 160 225 # 161 # 226 vop_poll { 162 #% getattr vp L L L 227 IN struct vnode *vp; 163 # 228 IN int events; 164 vop_getattr { 229 IN struct ucred *cred; 165 IN struct vnode *vp; 230 IN struct thread *td; 166 OUT struct vattr *vap; 231 }; 167 IN struct ucred *cred; 232 168 IN struct thread *td; 233 # 169 }; 234 #% kqfilter vp U U U 170 235 # 171 # 236 vop_kqfilter { 172 #% setattr vp L L L 237 IN struct vnode *vp; 173 # 238 IN struct knote *kn; 174 vop_setattr { 239 }; 175 IN struct vnode *vp; 240 176 IN struct vattr *vap; 241 # 177 IN struct ucred *cred; 242 #% revoke vp U U U 178 IN struct thread *td; 243 # 179 }; 244 vop_revoke { 180 245 IN struct vnode *vp; 181 # 246 IN int flags; 182 #% read vp L L L 247 }; 183 # 248 184 vop_read { 249 # 185 IN struct vnode *vp; 250 #% fsync vp L L L 186 INOUT struct uio *uio; 251 # 187 IN int ioflag; 252 vop_fsync { 188 IN struct ucred *cred; 253 IN struct vnode *vp; 189 }; 254 IN struct ucred *cred; 190 255 IN int waitfor; 191 # 256 IN struct thread *td; 192 #% write vp L L L 257 }; 193 # 258 194 vop_write { 259 # 195 IN struct vnode *vp; 260 #% remove dvp L L L 07/28/03 11:53:28 sys/kern/vnode_if.src 3 261 #% remove vp L L L 326 }; 262 # 327 263 vop_remove { 328 # 264 IN struct vnode *dvp; 329 #% readdir vp L L L 265 IN struct vnode *vp; 330 # 266 IN struct componentname *cnp; 331 vop_readdir { 267 }; 332 IN struct vnode *vp; 268 333 INOUT struct uio *uio; 269 # 334 IN struct ucred *cred; 270 #% link tdvp L L L 335 INOUT int *eofflag; 271 #% link vp L L L 336 OUT int *ncookies; 272 # 337 INOUT u_long **cookies; 273 vop_link { 338 }; 274 IN struct vnode *tdvp; 339 275 IN struct vnode *vp; 340 # 276 IN struct componentname *cnp; 341 #% readlink vp L L L 277 }; 342 # 278 343 vop_readlink { 279 # 344 IN struct vnode *vp; 280 # rename fdvp U U U 345 INOUT struct uio *uio; 281 # rename fvp U U U 346 IN struct ucred *cred; 282 # rename tdvp L U U 347 }; 283 # rename tvp X U U 348 284 #! 
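A locking annotation such as "#% fsync vp L L L" means the caller holds the vnode lock across VOP_FSYNC() and still holds it on return, whether or not the call succeeds. A typical caller therefore brackets the call as in this sketch:

	/* Illustrative: honour the "fsync vp L L L" locking spec. */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = VOP_FSYNC(vp, cred, MNT_WAIT, td);
	VOP_UNLOCK(vp, 0, td);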
rename pre vop_rename_pre 349 # 285 # 350 #% inactive vp L U U 286 vop_rename { 351 # 287 IN WILLRELE struct vnode *fdvp; 352 vop_inactive { 288 IN WILLRELE struct vnode *fvp; 353 IN struct vnode *vp; 289 IN struct componentname *fcnp; 354 IN struct thread *td; 290 IN WILLRELE struct vnode *tdvp; 355 }; 291 IN WILLRELE struct vnode *tvp; 356 292 IN struct componentname *tcnp; 357 # 293 }; 358 #% reclaim vp U U U 294 359 # 295 # 360 vop_reclaim { 296 #% mkdir dvp L L L 361 IN struct vnode *vp; 297 #% mkdir vpp - L - 362 IN struct thread *td; 298 # 363 }; 299 vop_mkdir { 364 300 IN struct vnode *dvp; 365 # 301 OUT struct vnode **vpp; 366 #lock vp ? ? ? 302 IN struct componentname *cnp; 367 #! lock pre vop_lock_pre 303 IN struct vattr *vap; 368 #! lock post vop_lock_post 304 }; 369 # 305 370 vop_lock { 306 # 371 IN struct vnode *vp; 307 #% rmdir dvp L L L 372 IN int flags; 308 #% rmdir vp L L L 373 IN struct thread *td; 309 # 374 }; 310 vop_rmdir { 375 311 IN struct vnode *dvp; 376 # 312 IN struct vnode *vp; 377 #unlock vp L ? L 313 IN struct componentname *cnp; 378 #! unlock pre vop_unlock_pre 314 }; 379 #! unlock post vop_unlock_post 315 380 # 316 # 381 vop_unlock { 317 #% symlink dvp L L L 382 IN struct vnode *vp; 318 #% symlink vpp - L - 383 IN int flags; 319 # 384 IN struct thread *td; 320 vop_symlink { 385 }; 321 IN struct vnode *dvp; 386 322 OUT struct vnode **vpp; 387 # 323 IN struct componentname *cnp; 388 #% bmap vp L L L 324 IN struct vattr *vap; 389 #% bmap vpp - U - 325 IN char *target; 390 # 07/28/03 11:53:28 sys/kern/vnode_if.src 4 391 vop_bmap { 456 vop_reallocblks { 392 IN struct vnode *vp; 457 IN struct vnode *vp; 393 IN daddr_t bn; 458 IN struct cluster_save *buflist; 394 OUT struct vnode **vpp; 459 }; 395 IN daddr_t *bnp; 460 396 OUT int *runp; 461 # 397 OUT int *runb; 462 #% getpages vp L L L 398 }; 463 # 399 464 vop_getpages { 400 # 465 IN struct vnode *vp; 401 # strategy vp L L L 466 IN vm_page_t *m; 402 #! strategy pre vop_strategy_pre 467 IN int count; 403 # 468 IN int reqpage; 404 vop_strategy { 469 IN vm_ooffset_t offset; 405 IN struct vnode *vp; 470 }; 406 IN struct buf *bp; 471 407 }; 472 # 408 473 #% putpages vp L L L 409 # 474 # 410 # specstrategy vp L L L 475 vop_putpages { 411 #! specstrategy pre vop_strategy_pre 476 IN struct vnode *vp; 412 # 477 IN vm_page_t *m; 413 vop_specstrategy { 478 IN int count; 414 IN struct vnode *vp; 479 IN int sync; 415 IN struct buf *bp; 480 IN int *rtvals; 416 }; 481 IN vm_ooffset_t offset; 417 482 }; 418 # 483 419 #% getwritemount vp = = = 484 # 420 # 485 #% freeblks vp - - - 421 vop_getwritemount { 486 # 422 IN struct vnode *vp; 487 # This call is used by the filesystem to release blocks back to 423 OUT struct mount **mpp; 488 # device-driver. This is useful if the driver has a lengthy 424 }; 489 # erase handling or similar. 
425 490 # 426 # 491 427 #% print vp = = = 492 vop_freeblks { 428 # 493 IN struct vnode *vp; 429 vop_print { 494 IN daddr_t addr; 430 IN struct vnode *vp; 495 IN daddr_t length; 431 }; 496 }; 432 497 433 # 498 # 434 #% pathconf vp L L L 499 #% getacl vp L L L 435 # 500 # 436 vop_pathconf { 501 vop_getacl { 437 IN struct vnode *vp; 502 IN struct vnode *vp; 438 IN int name; 503 IN acl_type_t type; 439 OUT register_t *retval; 504 OUT struct acl *aclp; 440 }; 505 IN struct ucred *cred; 441 506 IN struct thread *td; 442 # 507 }; 443 #% advlock vp U U U 508 444 # 509 # 445 vop_advlock { 510 #% setacl vp L L L 446 IN struct vnode *vp; 511 # 447 IN caddr_t id; 512 vop_setacl { 448 IN int op; 513 IN struct vnode *vp; 449 IN struct flock *fl; 514 IN acl_type_t type; 450 IN int flags; 515 IN struct acl *aclp; 451 }; 516 IN struct ucred *cred; 452 517 IN struct thread *td; 453 # 518 }; 454 #% reallocblks vp L L L 519 455 # 520 # 07/28/03 11:53:28 sys/kern/vnode_if.src 5 521 #% aclcheck vp = = = 586 # 522 # 587 #% setextattr vp L L L 523 vop_aclcheck { 588 # 524 IN struct vnode *vp; 589 vop_setextattr { 525 IN acl_type_t type; 590 IN struct vnode *vp; 526 IN struct acl *aclp; 591 IN int attrnamespace; 527 IN struct ucred *cred; 592 IN const char *name; 528 IN struct thread *td; 593 INOUT struct uio *uio; 529 }; 594 IN struct ucred *cred; 530 595 IN struct thread *td; 531 # 596 }; 532 #% closeextattr vp L L L 597 533 # 598 # 534 vop_closeextattr { 599 #% createvobject vp L L L 535 IN struct vnode *vp; 600 # 536 IN int commit; 601 vop_createvobject { 537 IN struct ucred *cred; 602 IN struct vnode *vp; 538 IN struct thread *td; 603 IN struct ucred *cred; 539 }; 604 IN struct thread *td; 540 605 }; 541 # 606 542 #% getextattr vp L L L 607 # 543 # 608 #% destroyvobject vp L L L 544 vop_getextattr { 609 # 545 IN struct vnode *vp; 610 vop_destroyvobject { 546 IN int attrnamespace; 611 IN struct vnode *vp; 547 IN const char *name; 612 }; 548 INOUT struct uio *uio; 613 549 OUT size_t *size; 614 # 550 IN struct ucred *cred; 615 #% getvobject vp L L L 551 IN struct thread *td; 616 # 552 }; 617 vop_getvobject { 553 618 IN struct vnode *vp; 554 # 619 OUT struct vm_object **objpp; 555 #% listextattr vp L L L 620 }; 556 # 621 557 vop_listextattr { 622 # 558 IN struct vnode *vp; 623 #% setlabel vp L L L 559 IN int attrnamespace; 624 # 560 INOUT struct uio *uio; 625 vop_setlabel { 561 OUT size_t *size; 626 IN struct vnode *vp; 562 IN struct ucred *cred; 627 IN struct label *label; 563 IN struct thread *td; 628 IN struct ucred *cred; 564 }; 629 IN struct thread *td; 565 630 }; 566 # 567 #% openextattr vp L L L 568 # 569 vop_openextattr { 570 IN struct vnode *vp; 571 IN struct ucred *cred; 572 IN struct thread *td; 573 }; 574 575 # 576 #% deleteextattr vp L L L 577 # 578 vop_deleteextattr { 579 IN struct vnode *vp; 580 IN int attrnamespace; 581 IN const char *name; 582 IN struct ucred *cred; 583 IN struct thread *td; 584 }; 585 09/24/03 18:10:25 sys/kern/imgact_elf.c 1 1 /*- 303 rv = vm_map_find(map, 0, 0, &start, end - start, 2 * Copyright (c) 2000 David O’Brien 304 FALSE, prot, max, 0); 3 * Copyright (c) 1995-1996 Søren Schmidt 305 if (rv) 4 * Copyright (c) 1996 Peter Wemm 306 return (rv); 5 * All rights reserved. 307 data_buf = 0; 6 * 308 while (start < end) { 7 * Redistribution and use in source and binary forms, with or without 309 vm_object_reference(object); 8 * modification, are permitted provided that the following conditions 310 rv = vm_map_find(exec_map, 9 * are met: 311 object, 10 * 1. 
Redistributions of source code must retain the above copyright 312 trunc_page(offset), 11 * notice, this list of conditions and the following disclaimer 313 &data_buf, 12 * in this position and unchanged. 314 2 * PAGE_SIZE, 13 * 2. Redistributions in binary form must reproduce the above copyright 315 TRUE, 14 * notice, this list of conditions and the following disclaimer in the 316 VM_PROT_READ, 15 * documentation and/or other materials provided with the distribution. 317 VM_PROT_ALL, 16 * 3. The name of the author may not be used to endorse or promote products 318 (MAP_COPY_ON_WRITE 17 * derived from this software without specific prior written permission 319 | MAP_PREFAULT_PARTIAL)); 18 * 320 if (rv != KERN_SUCCESS) { 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ‘‘AS IS’’ AND ANY EXPRESS OR 321 vm_object_deallocate(object); 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 322 return (rv); 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 323 } 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 324 off = offset - trunc_page(offset); 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 325 sz = end - start; 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 326 if (sz > PAGE_SIZE) 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 327 sz = PAGE_SIZE; 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 328 error = copyout((caddr_t)data_buf + off, 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 329 (caddr_t)start, sz); 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 330 vm_map_remove(exec_map, data_buf, 29 */ 331 data_buf + 2 * PAGE_SIZE); 30 332 if (error) { 31 333 return (KERN_FAILURE); 32 __FBSDID("$FreeBSD: src/sys/kern/imgact_elf.c,v 1.141 2003/09/25 01:10:25 pete 334 } r Exp $"); 335 start += sz; 336 } 337 rv = KERN_SUCCESS; 273 static int 338 } else { 274 __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 339 vm_map_lock(map); 275 vm_offset_t start, vm_offset_t end, vm_prot_t prot, 340 rv = vm_map_insert(map, object, offset, start, end, 276 vm_prot_t max, int cow) 341 prot, max, cow); 277 { 342 vm_map_unlock(map); 278 vm_offset_t data_buf, off; 343 } 279 vm_size_t sz; 344 return (rv); 280 int error, rv; 345 } else { 281 346 return (KERN_SUCCESS); 282 if (start != trunc_page(start)) { 347 } 283 rv = __elfN(map_partial)(map, object, offset, start, 348 } 284 round_page(start), prot, max); 349 285 if (rv) 350 static int 286 return (rv); 351 __elfN(load_section)(struct proc *p, struct vmspace *vmspace, 287 offset += round_page(start) - start; 352 struct vnode *vp, vm_object_t object, vm_offset_t offset, 288 start = round_page(start); 353 caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, 289 } 354 size_t pagesize) 290 if (end != round_page(end)) { 355 { 291 rv = __elfN(map_partial)(map, object, offset + 356 size_t map_len; 292 trunc_page(end) - start, trunc_page(end), end, prot, max); 357 vm_offset_t map_addr; 293 if (rv) 358 int error, rv, cow; 294 return (rv); 359 size_t copy_len; 295 end = trunc_page(end); 360 vm_offset_t file_addr; 296 } 361 vm_offset_t data_buf = 0; 297 if (end > start) { 362 298 if (offset & PAGE_MASK) { 363 GIANT_REQUIRED; 299 /* 364 300 * The mapping is not page aligned. This means we have 365 error = 0; 301 * to copy the data. Sigh. 
366 302 */ 367 /* 09/24/03 18:10:25 sys/kern/imgact_elf.c 2 368 * It’s necessary to fail if the filsz + offset taken from the 433 map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize); 369 * header is greater than the actual file pager object’s size. 434 map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) - 370 * If we were to allow this, then the vm_map_find() below would 435 map_addr; 371 * walk right off the end of the file object and into the ether. 436 372 * 437 /* This had damn well better be true! */ 373 * While I’m here, might as well check for something else that 438 if (map_len != 0) { 374 * is invalid: filsz cannot be greater than memsz. 439 rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr, 375 */ 440 map_addr + map_len, VM_PROT_ALL, VM_PROT_ALL, 0); 376 if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size || 441 if (rv != KERN_SUCCESS) { 377 filsz > memsz) { 442 return (EINVAL); 378 uprintf("elf_load_section: truncated ELF file\n"); 443 } 379 return (ENOEXEC); 444 } 380 } 445 381 446 if (copy_len != 0) { 382 #define trunc_page_ps(va, ps) ((va) & ˜(ps - 1)) 447 vm_offset_t off; 383 #define round_page_ps(va, ps) (((va) + (ps - 1)) & ˜(ps - 1)) 448 vm_object_reference(object); 384 449 rv = vm_map_find(exec_map, 385 map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize); 450 object, 386 file_addr = trunc_page_ps(offset, pagesize); 451 trunc_page(offset + filsz), 387 452 &data_buf, 388 /* 453 PAGE_SIZE, 389 * We have two choices. We can either clear the data in the last page 454 TRUE, 390 * of an oversized mapping, or we can start the anon mapping a page 455 VM_PROT_READ, 391 * early and copy the initialized data into that first page. We 456 VM_PROT_ALL, 392 * choose the second.. 457 MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL); 393 */ 458 if (rv != KERN_SUCCESS) { 394 if (memsz > filsz) 459 vm_object_deallocate(object); 395 map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr; 460 return (EINVAL); 396 else 461 } 397 map_len = round_page_ps(offset + filsz, pagesize) - file_addr; 462 398 463 /* send the page fragment to user space */ 399 if (map_len != 0) { 464 off = trunc_page_ps(offset + filsz, pagesize) - 400 vm_object_reference(object); 465 trunc_page(offset + filsz); 401 466 error = copyout((caddr_t)data_buf + off, (caddr_t)map_addr, 402 /* cow flags: don’t dump readonly sections in core */ 467 copy_len); 403 cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | 468 vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE); 404 (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); 469 if (error) { 405 470 return (error); 406 rv = __elfN(map_insert)(&vmspace->vm_map, 471 } 407 object, 472 } 408 file_addr, /* file offset */ 473 409 map_addr, /* virtual start */ 474 /* 410 map_addr + map_len,/* virtual end */ 475 * set it to the specified protection. 411 prot, 476 * XXX had better undo the damage from pasting over the cracks here! 
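The trunc_page_ps()/round_page_ps() arithmetic in load_section() is easier to follow with concrete numbers. The block below is a worked example with invented values and 4 KB pages:

	/*
	 * Worked example (illustrative values, 4 KB pages):
	 *   vmaddr = 0x0804a100, offset = 0x1100, filsz = 0x1e80, memsz = 0x5000
	 *
	 *   map_addr  = trunc_page_ps(vmaddr, ps)         = 0x0804a000
	 *   file_addr = trunc_page_ps(offset, ps)         = 0x00001000
	 *   memsz > filsz, so
	 *   map_len   = trunc_page_ps(offset + filsz, ps) - file_addr
	 *             = 0x2000 - 0x1000                   = 0x1000 (whole file pages)
	 *   copy_len  = (offset + filsz) - trunc_page_ps(offset + filsz, ps)
	 *             = 0x2f80 - 0x2000                   = 0x0f80 (tail fragment)
	 *
	 * The 0x1000 bytes of whole pages are mapped copy-on-write straight from
	 * the file object; the remaining 0xf80 bytes land in the first page of the
	 * anonymous (bss) mapping and are copied in by hand via copyout().
	 */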
412 VM_PROT_ALL, 477 */ 413 cow); 478 vm_map_protect(&vmspace->vm_map, trunc_page(map_addr), 414 if (rv != KERN_SUCCESS) { 479 round_page(map_addr + map_len), prot, FALSE); 415 vm_object_deallocate(object); 480 416 return (EINVAL); 481 return (error); 417 } 482 } 418 419 /* we can stop now if we’ve covered it all */ 420 if (memsz == filsz) { 643 static int 421 return (0); 644 __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) 422 } 645 { 423 } 646 const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; 424 647 const Elf_Phdr *phdr; 425 648 Elf_Auxargs *elf_auxargs = NULL; 426 /* 649 struct vmspace *vmspace; 427 * We have to get the remaining bit of the file into the first part 650 vm_prot_t prot; 428 * of the oversized map segment. This is normally because the .data 651 u_long text_size = 0, data_size = 0, total_size = 0; 429 * segment in the file is extended to provide bss. It’s a neat idea 652 u_long text_addr = 0, data_addr = 0; 430 * to try and save a page, but it’s a pain in the behind to implement. 653 u_long seg_size, seg_addr; 431 */ 654 u_long addr, entry = 0, proghdr = 0; 432 copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize); 655 int error, i; 09/24/03 18:10:25 sys/kern/imgact_elf.c 3 656 const char *interp = NULL; 721 case PT_LOAD: /* Loadable segment */ 657 Elf_Brandinfo *brand_info; 722 prot = 0; 658 char *path; 723 if (phdr[i].p_flags & PF_X) 659 struct thread *td = curthread; 724 prot |= VM_PROT_EXECUTE; 660 struct sysentvec *sv; 725 if (phdr[i].p_flags & PF_W) 661 726 prot |= VM_PROT_WRITE; 662 GIANT_REQUIRED; 727 if (phdr[i].p_flags & PF_R) 663 728 prot |= VM_PROT_READ; 664 /* 729 665 * Do we have a valid ELF header ? 730 #if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER) 666 */ 731 /* 667 if (__elfN(check_header)(hdr) != 0 || hdr->e_type != ET_EXEC) 732 * Some x86 binaries assume read == executable, 668 return (-1); 733 * notably the M3 runtime and therefore cvsup 669 734 */ 670 /* 735 if (prot & VM_PROT_READ) 671 * From here on down, we return an errno, not -1, as we’ve 736 prot |= VM_PROT_EXECUTE; 672 * detected an ELF file. 737 #endif 673 */ 738 674 739 if ((error = __elfN(load_section)(imgp->proc, vmspace, 675 if ((hdr->e_phoff > PAGE_SIZE) || 740 imgp->vp, imgp->object, phdr[i].p_offset, 676 (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { 741 (caddr_t)(uintptr_t)phdr[i].p_vaddr, 677 /* Only support headers in first page for now */ 742 phdr[i].p_memsz, phdr[i].p_filesz, prot, 678 return (ENOEXEC); 743 sv->sv_pagesize)) != 0) 679 } 744 goto fail; 680 phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); 745 681 746 seg_addr = trunc_page(phdr[i].p_vaddr); 682 /* 747 seg_size = round_page(phdr[i].p_memsz + 683 * From this point on, we may have resources that need to be freed. 748 phdr[i].p_vaddr - seg_addr); 684 */ 749 685 750 /* 686 VOP_UNLOCK(imgp->vp, 0, td); 751 * Is this .text or .data? We can’t use 687 752 * VM_PROT_WRITE or VM_PROT_EXEC, it breaks the 688 for (i = 0; i < hdr->e_phnum; i++) { 753 * alpha terribly and possibly does other bad 689 switch (phdr[i].p_type) { 754 * things so we stick to the old way of figuring 690 case PT_INTERP: /* Path to interpreter */ 755 * it out: If the segment contains the program 691 if (phdr[i].p_filesz > MAXPATHLEN || 756 * entry point, it’s a text segment, otherwise it 692 phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) { 757 * is a data segment. 
693 error = ENOEXEC; 758 * 694 goto fail; 759 * Note that obreak() assumes that data_addr + 695 } 760 * data_size == end of data load area, and the ELF 696 interp = imgp->image_header + phdr[i].p_offset; 761 * file format expects segments to be sorted by 697 break; 762 * address. If multiple data segments exist, the 698 default: 763 * last one will be used. 699 break; 764 */ 700 } 765 if (hdr->e_entry >= phdr[i].p_vaddr && 701 } 766 hdr->e_entry < (phdr[i].p_vaddr + 702 767 phdr[i].p_memsz)) { 703 brand_info = __elfN(get_brandinfo)(hdr, interp); 768 text_size = seg_size; 704 if (brand_info == NULL) { 769 text_addr = seg_addr; 705 uprintf("ELF binary type \"%u\" not known.\n", 770 entry = (u_long)hdr->e_entry; 706 hdr->e_ident[EI_OSABI]); 771 } else { 707 error = ENOEXEC; 772 data_size = seg_size; 708 goto fail; 773 data_addr = seg_addr; 709 } 774 } 710 sv = brand_info->sysvec; 775 total_size += seg_size; 711 776 break; 712 if ((error = exec_extract_strings(imgp)) != 0) 777 case PT_PHDR: /* Program header table info */ 713 goto fail; 778 proghdr = phdr[i].p_vaddr; 714 779 break; 715 exec_new_vmspace(imgp, sv); 780 default: 716 781 break; 717 vmspace = imgp->proc->p_vmspace; 782 } 718 783 } 719 for (i = 0; i < hdr->e_phnum; i++) { 784 720 switch (phdr[i].p_type) { 785 if (data_addr == 0 && data_size == 0) { 09/24/03 18:10:25 sys/kern/imgact_elf.c 4 786 data_addr = text_addr; 850 elf_auxargs->trace = elf_trace; 787 data_size = text_size; 851 788 } 852 imgp->auxargs = elf_auxargs; 789 853 imgp->interpreted = 0; 790 /* 854 791 * Check limits. It should be safe to check the 855 fail: 792 * limits after loading the segments since we do 856 vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); 793 * not actually fault in all the segments pages. 857 return (error); 794 */ 858 } 795 if (data_size > 796 imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur || 797 text_size > maxtsiz || 798 total_size > 799 imgp->proc->p_rlimit[RLIMIT_VMEM].rlim_cur) { 800 error = ENOMEM; 801 goto fail; 802 } 803 804 vmspace->vm_tsize = text_size >> PAGE_SHIFT; 805 vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; 806 vmspace->vm_dsize = data_size >> PAGE_SHIFT; 807 vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; 808 809 /* 810 * We load the dynamic linker where a userland call 811 * to mmap(0, ...) would put it. The rationale behind this 812 * calculation is that it leaves room for the heap to grow to 813 * its maximum allowed size. 
814 */ 815 addr = round_page((vm_offset_t)imgp->proc->p_vmspace->vm_daddr + 816 imgp->proc->p_rlimit[RLIMIT_DATA].rlim_max); 817 818 imgp->entry_addr = entry; 819 820 imgp->proc->p_sysent = sv; 821 if (interp != NULL) { 822 path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 823 snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, 824 interp); 825 if ((error = __elfN(load_file)(imgp->proc, path, &addr, 826 &imgp->entry_addr, sv->sv_pagesize)) != 0) { 827 if ((error = __elfN(load_file)(imgp->proc, interp, 828 &addr, &imgp->entry_addr, sv->sv_pagesize)) != 0) { 829 uprintf("ELF interpreter %s not found\n", 830 path); 831 free(path, M_TEMP); 832 goto fail; 833 } 834 } 835 free(path, M_TEMP); 836 } 837 838 /* 839 * Construct auxargs table (used by the fixup routine) 840 */ 841 elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); 842 elf_auxargs->execfd = -1; 843 elf_auxargs->phdr = proghdr; 844 elf_auxargs->phent = hdr->e_phentsize; 845 elf_auxargs->phnum = hdr->e_phnum; 846 elf_auxargs->pagesz = PAGE_SIZE; 847 elf_auxargs->base = addr; 848 elf_auxargs->flags = 0; 849 elf_auxargs->entry = entry; 07/27/03 10:04:55 sys/kern/kern_acct.c 1 1 /*- 232 mtx_lock(&acct_mtx); 2 * Copyright (c) 1994 Christopher G. Demetriou 233 3 * Copyright (c) 1982, 1986, 1989, 1993 234 /* If accounting isn’t enabled, don’t bother */ 4 * The Regents of the University of California. All rights reserved. 235 vp = acctp; 5 * (c) UNIX System Laboratories, Inc. 236 if (vp == NULLVP) { 6 * All or some portions of this file are derived from material licensed 237 mtx_unlock(&acct_mtx); 7 * to the University of California by American Telephone and Telegraph 238 return (0); 8 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 239 } 9 * the permission of UNIX System Laboratories, Inc. 240 10 * 241 /* 11 * Redistribution and use in source and binary forms, with or without 242 * Get process accounting information. 12 * modification, are permitted provided that the following conditions 243 */ 13 * are met: 244 14 * 1. Redistributions of source code must retain the above copyright 245 PROC_LOCK(p); 15 * notice, this list of conditions and the following disclaimer. 246 /* (1) The name of the command that ran */ 16 * 2. Redistributions in binary form must reproduce the above copyright 247 bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); 17 * notice, this list of conditions and the following disclaimer in the 248 18 * documentation and/or other materials provided with the distribution. 249 /* (2) The amount of user and system time that was used */ 19 * 3. All advertising materials mentioning features or use of this software 250 mtx_lock_spin(&sched_lock); 20 * must display the following acknowledgement: 251 calcru(p, &ut, &st, NULL); 21 * This product includes software developed by the University of 252 mtx_unlock_spin(&sched_lock); 22 * California, Berkeley and its contributors. 253 acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); 23 * 4. Neither the name of the University nor the names of its contributors 254 acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); 24 * may be used to endorse or promote products derived from this software 255 25 * without specific prior written permission. 
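ac_utime, ac_stime and ac_etime are stored as comp_t, a pseudo floating point format with a 3-bit base-8 exponent and a 13-bit mantissa. The encoder sketched below closely follows the traditional BSD implementation; AHZ is assumed to be the accounting tick rate from sys/acct.h:

	#define	MANTSIZE	13			/* 13 bit mantissa. */
	#define	EXPSIZE		3			/* Base 8 (3 bit) exponent. */
	#define	MAXFRACT	((1 << MANTSIZE) - 1)	/* Maximum fractional value. */

	/* Sketch of the comp_t encoder used above. */
	static comp_t
	encode_comp_t(u_long s, u_long us)
	{
		int exp, rnd;

		exp = 0;
		rnd = 0;
		s *= AHZ;
		s += us / (1000000 / AHZ);	/* Maximize precision. */

		while (s > MAXFRACT) {
			rnd = s & (1 << (EXPSIZE - 1));	/* Round up? */
			s >>= EXPSIZE;		/* Base 8 exponent == 3 bit shift. */
			exp++;
		}
		/* If we need to round up, do it (and handle overflow correctly). */
		if (rnd && (++s > MAXFRACT)) {
			s >>= EXPSIZE;
			exp++;
		}
		exp <<= MANTSIZE;		/* Shift the exponent into place */
		exp += s;			/* and add on the mantissa. */
		return (exp);
	}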
256 /* (3) The elapsed time the command ran (and its starting time) */ 26 * 257 tmp = boottime; 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 258 timevaladd(&tmp, &p->p_stats->p_start); 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 259 acct.ac_btime = tmp.tv_sec; 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 260 microuptime(&tmp); 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 261 timevalsub(&tmp, &p->p_stats->p_start); 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 262 acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 263 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 264 /* (4) The average amount of memory used */ 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 265 r = &p->p_stats->p_ru; 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 266 tmp = ut; 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 267 timevaladd(&tmp, &st); 37 * SUCH DAMAGE. 268 t = tmp.tv_sec * hz + tmp.tv_usec / tick; 38 * 269 if (t) 39 * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 270 acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; 40 */ 271 else 41 272 acct.ac_mem = 0; 42 #include 273 43 __FBSDID("$FreeBSD: src/sys/kern/kern_acct.c,v 1.68 2003/07/27 17:04:55 phk Ex 274 /* (5) The number of disk I/O operations done */ p $"); 275 acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); 276 277 /* (6) The UID and GID of the process */ 213 /* 278 acct.ac_uid = p->p_ucred->cr_ruid; 214 * Write out process accounting information, on process exit. 279 acct.ac_gid = p->p_ucred->cr_rgid; 215 * Data to be written out is specified in Leffler, et al. 280 216 * and are enumerated below. (They’re also noted in the system 281 /* (7) The terminal from which the process was started */ 217 * "acct.h" header file.) 282 SESS_LOCK(p->p_session); 218 */ 283 if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) 219 284 acct.ac_tty = dev2udev(p->p_pgrp->pg_session->s_ttyp->t_dev); 220 int 285 else 221 acct_process(td) 286 acct.ac_tty = NOUDEV; 222 struct thread *td; 287 SESS_UNLOCK(p->p_session); 223 { 288 224 struct proc *p = td->td_proc; 289 /* (8) The boolean flags that tell how the process terminated, etc. */ 225 struct acct acct; 290 acct.ac_flag = p->p_acflag; 226 struct rusage *r; 291 PROC_UNLOCK(p); 227 struct timeval ut, st, tmp; 292 228 int t, ret; 293 /* 229 struct vnode *vp; 294 * Write the accounting information to the file. 230 struct ucred *uc; 295 */ 231 296 uc = crhold(acctcred); 07/27/03 10:04:55 sys/kern/kern_acct.c 2 297 vref(vp); 298 mtx_unlock(&acct_mtx); 299 300 /* 301 * Eliminate any file size rlimit. 302 */ 303 if (p->p_limit->p_refcnt > 1) { 304 p->p_limit->p_refcnt--; 305 p->p_limit = limcopy(p->p_limit); 306 } 307 p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 308 309 VOP_LEASE(vp, td, uc, LEASE_WRITE); 310 ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct), 311 (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, uc, NOCRED, 312 (int *)0, td); 313 vrele(vp); 314 crfree(uc); 315 return (ret); 316 } 12/30/03 12:13:19 sys/kern/kern_descrip.c 1 1 /* 65 #include 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 66 #include 3 * The Regents of the University of California. All rights reserved. 67 #include 4 * (c) UNIX System Laboratories, Inc. 
68 #include 5 * All or some portions of this file are derived from material licensed 69 #include 6 * to the University of California by American Telephone and Telegraph 70 #include 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 71 #include 8 * the permission of UNIX System Laboratories, Inc. 72 9 * 73 #include 10 * Redistribution and use in source and binary forms, with or without 74 #include 11 * modification, are permitted provided that the following conditions 75 #include 12 * are met: 76 13 * 1. Redistributions of source code must retain the above copyright 77 static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); 14 * notice, this list of conditions and the following disclaimer. 78 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader", 15 * 2. Redistributions in binary form must reproduce the above copyright 79 "file desc to leader structures"); 16 * notice, this list of conditions and the following disclaimer in the 80 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); 17 * documentation and/or other materials provided with the distribution. 81 18 * 3. All advertising materials mentioning features or use of this software 82 static uma_zone_t file_zone; 19 * must display the following acknowledgement: 83 20 * This product includes software developed by the University of 84 static d_open_t fdopen; 21 * California, Berkeley and its contributors. 85 #define NUMFDESC 64 22 * 4. Neither the name of the University nor the names of its contributors 86 23 * may be used to endorse or promote products derived from this software 87 #define CDEV_MAJOR 22 24 * without specific prior written permission. 88 static struct cdevsw fildesc_cdevsw = { 25 * 89 .d_open = fdopen, 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 90 .d_name = "FD", 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 91 .d_maj = CDEV_MAJOR, 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 92 }; 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 93 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 94 /* How to treat ’new’ parameter when allocating a fd for do_dup(). */ 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 95 enum dup_type { DUP_VARIABLE, DUP_FIXED }; 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 96 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 97 static int do_dup(struct thread *td, enum dup_type type, int old, int new, 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 98 register_t *retval); 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 99 36 * SUCH DAMAGE. 100 /* 37 * 101 * Descriptor management. 38 * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 102 */ 39 */ 103 struct filelist filehead; /* head of list of open files */ 40 104 int nfiles; /* actual number of open files */ 41 #include 105 struct sx filelist_lock; /* sx to protect filelist */ 42 __FBSDID("$FreeBSD: src/sys/kern/kern_descrip.c,v 1.215.2.1 2003/12/30 20:13:1 106 struct mtx sigio_lock; /* mtx to protect pointers to sigio */ 9 dwmalone Exp $"); 107 43 108 /* 44 #include "opt_compat.h" 109 * System calls on descriptors. 
45 110 */ 46 #include 111 #ifndef _SYS_SYSPROTO_H_ 47 #include 112 struct getdtablesize_args { 48 #include 113 int dummy; 49 #include 114 }; 50 #include 115 #endif 51 #include 116 /* 52 #include 117 * MPSAFE 53 #include 118 */ 54 #include 119 /* ARGSUSED */ 55 #include 120 int 56 #include 121 getdtablesize(td, uap) 57 #include 122 struct thread *td; 58 #include 123 struct getdtablesize_args *uap; 59 #include 124 { 60 #include 125 struct proc *p = td->td_proc; 61 #include 126 62 #include 127 mtx_lock(&Giant); 63 #include 128 td->td_retval[0] = 64 #include 129 min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); 12/30/03 12:13:19 sys/kern/kern_descrip.c 2 130 mtx_unlock(&Giant); 195 int 131 return (0); 196 fcntl(td, uap) 132 } 197 struct thread *td; 133 198 struct fcntl_args *uap; 134 /* 199 { 135 * Duplicate a file descriptor to a particular value. 200 struct flock fl; 136 * 201 intptr_t arg; 137 * note: keep in mind that a potential race condition exists when closing 202 int error; 138 * descriptors from a shared descriptor table (via rfork). 203 139 */ 204 error = 0; 140 #ifndef _SYS_SYSPROTO_H_ 205 switch (uap->cmd) { 141 struct dup2_args { 206 case F_GETLK: 142 u_int from; 207 case F_SETLK: 143 u_int to; 208 case F_SETLKW: 144 }; 209 error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); 145 #endif 210 arg = (intptr_t)&fl; 146 /* 211 break; 147 * MPSAFE 212 default: 148 */ 213 arg = uap->arg; 149 /* ARGSUSED */ 214 break; 150 int 215 } 151 dup2(td, uap) 216 if (error) 152 struct thread *td; 217 return (error); 153 struct dup2_args *uap; 218 error = kern_fcntl(td, uap->fd, uap->cmd, arg); 154 { 219 if (error) 155 220 return (error); 156 return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, 221 if (uap->cmd == F_GETLK) 157 td->td_retval)); 222 error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); 158 } 223 return (error); 159 224 } 160 /* 225 161 * Duplicate a file descriptor. 226 int 162 */ 227 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) 163 #ifndef _SYS_SYSPROTO_H_ 228 { 164 struct dup_args { 229 struct filedesc *fdp; 165 u_int fd; 230 struct flock *flp; 166 }; 231 struct file *fp; 167 #endif 232 struct proc *p; 168 /* 233 char *pop; 169 * MPSAFE 234 struct vnode *vp; 170 */ 235 u_int newmin; 171 /* ARGSUSED */ 236 int error, flg, tmp; 172 int 237 173 dup(td, uap) 238 error = 0; 174 struct thread *td; 239 flg = F_POSIX; 175 struct dup_args *uap; 240 p = td->td_proc; 176 { 241 fdp = p->p_fd; 177 242 mtx_lock(&Giant); 178 return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval)); 243 FILEDESC_LOCK(fdp); 179 } 244 if ((unsigned)fd >= fdp->fd_nfiles || 180 245 (fp = fdp->fd_ofiles[fd]) == NULL) { 181 /* 246 FILEDESC_UNLOCK(fdp); 182 * The file control system call. 247 error = EBADF; 183 */ 248 goto done2; 184 #ifndef _SYS_SYSPROTO_H_ 249 } 185 struct fcntl_args { 250 pop = &fdp->fd_ofileflags[fd]; 186 int fd; 251 187 int cmd; 252 switch (cmd) { 188 long arg; 253 case F_DUPFD: 189 }; 254 FILEDESC_UNLOCK(fdp); 190 #endif 255 newmin = arg; 191 /* 256 if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || 192 * MPSAFE 257 newmin >= maxfilesperproc) { 193 */ 258 error = EINVAL; 194 /* ARGSUSED */ 259 break; 12/30/03 12:13:19 sys/kern/kern_descrip.c 3 260 } 325 261 error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval); 326 case F_SETLKW: 262 break; 327 flg |= F_WAIT; 263 328 /* FALLTHROUGH F_SETLK */ 264 case F_GETFD: 329 265 td->td_retval[0] = (*pop & UF_EXCLOSE) ? 
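dup(), dup2() and fcntl(F_DUPFD) all funnel into do_dup(); the difference is DUP_VARIABLE (lowest free descriptor at or above the hint) versus DUP_FIXED (exactly the requested slot, duplicating onto itself being a no-op). A short userland sketch of the visible difference, using /dev/null and arbitrary example descriptor numbers:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/null", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* F_DUPFD: lowest free descriptor >= 10 (do_dup with DUP_VARIABLE). */
	int a = fcntl(fd, F_DUPFD, 10);

	/* dup2: exactly descriptor 20, closing whatever was there (DUP_FIXED). */
	int b = dup2(fd, 20);

	/* dup2 onto itself just returns the descriptor unchanged. */
	int c = dup2(fd, fd);

	printf("fd=%d, F_DUPFD>=10 -> %d, dup2 -> %d, dup2(fd,fd) -> %d\n",
	    fd, a, b, c);
	return (0);
}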
FD_CLOEXEC : 0; 330 case F_SETLK: 266 FILEDESC_UNLOCK(fdp); 331 if (fp->f_type != DTYPE_VNODE) { 267 break; 332 FILEDESC_UNLOCK(fdp); 268 333 error = EBADF; 269 case F_SETFD: 334 break; 270 *pop = (*pop &˜ UF_EXCLOSE) | 335 } 271 (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); 336 272 FILEDESC_UNLOCK(fdp); 337 flp = (struct flock *)arg; 273 break; 338 if (flp->l_whence == SEEK_CUR) { 274 339 if (fp->f_offset < 0 || 275 case F_GETFL: 340 (flp->l_start > 0 && 276 FILE_LOCK(fp); 341 fp->f_offset > OFF_MAX - flp->l_start)) { 277 FILEDESC_UNLOCK(fdp); 342 FILEDESC_UNLOCK(fdp); 278 td->td_retval[0] = OFLAGS(fp->f_flag); 343 error = EOVERFLOW; 279 FILE_UNLOCK(fp); 344 break; 280 break; 345 } 281 346 flp->l_start += fp->f_offset; 282 case F_SETFL: 347 } 283 FILE_LOCK(fp); 348 284 FILEDESC_UNLOCK(fdp); 349 /* 285 fhold_locked(fp); 350 * VOP_ADVLOCK() may block. 286 fp->f_flag &= ˜FCNTLFLAGS; 351 */ 287 fp->f_flag |= FFLAGS(arg & ˜O_ACCMODE) & FCNTLFLAGS; 352 fhold(fp); 288 FILE_UNLOCK(fp); 353 FILEDESC_UNLOCK(fdp); 289 tmp = fp->f_flag & FNONBLOCK; 354 vp = fp->f_vnode; 290 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 355 291 if (error) { 356 switch (flp->l_type) { 292 fdrop(fp, td); 357 case F_RDLCK: 293 break; 358 if ((fp->f_flag & FREAD) == 0) { 294 } 359 error = EBADF; 295 tmp = fp->f_flag & FASYNC; 360 break; 296 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); 361 } 297 if (error == 0) { 362 PROC_LOCK(p->p_leader); 298 fdrop(fp, td); 363 p->p_leader->p_flag |= P_ADVLOCK; 299 break; 364 PROC_UNLOCK(p->p_leader); 300 } 365 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 301 FILE_LOCK(fp); 366 flp, flg); 302 fp->f_flag &= ˜FNONBLOCK; 367 break; 303 FILE_UNLOCK(fp); 368 case F_WRLCK: 304 tmp = 0; 369 if ((fp->f_flag & FWRITE) == 0) { 305 (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); 370 error = EBADF; 306 fdrop(fp, td); 371 break; 307 break; 372 } 308 373 PROC_LOCK(p->p_leader); 309 case F_GETOWN: 374 p->p_leader->p_flag |= P_ADVLOCK; 310 fhold(fp); 375 PROC_UNLOCK(p->p_leader); 311 FILEDESC_UNLOCK(fdp); 376 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, 312 error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); 377 flp, flg); 313 if (error == 0) 378 break; 314 td->td_retval[0] = tmp; 379 case F_UNLCK: 315 fdrop(fp, td); 380 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, 316 break; 381 flp, F_POSIX); 317 382 break; 318 case F_SETOWN: 383 default: 319 fhold(fp); 384 error = EINVAL; 320 FILEDESC_UNLOCK(fdp); 385 break; 321 tmp = arg; 386 } 322 error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); 387 /* Check for race with close */ 323 fdrop(fp, td); 388 FILEDESC_LOCK(fdp); 324 break; 389 if ((unsigned) fd >= fdp->fd_nfiles || 12/30/03 12:13:19 sys/kern/kern_descrip.c 4 390 fp != fdp->fd_ofiles[fd]) { 455 struct thread *td; 391 FILEDESC_UNLOCK(fdp); 456 { 392 flp->l_whence = SEEK_SET; 457 struct filedesc *fdp; 393 flp->l_start = 0; 458 struct proc *p; 394 flp->l_len = 0; 459 struct file *fp; 395 flp->l_type = F_UNLCK; 460 struct file *delfp; 396 (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, 461 int error, newfd; 397 F_UNLCK, flp, F_POSIX); 462 int holdleaders; 398 } else 463 399 FILEDESC_UNLOCK(fdp); 464 p = td->td_proc; 400 fdrop(fp, td); 465 fdp = p->p_fd; 401 break; 466 402 467 /* 403 case F_GETLK: 468 * Verify we have a valid descriptor to dup from and possibly to 404 if (fp->f_type != DTYPE_VNODE) { 469 * dup to. 
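The F_GETFL/F_SETFL cases translate between the user-visible O_* flags and the kernel's FNONBLOCK/FASYNC bits through the FIONBIO and FIOASYNC ioctls. A conventional userland helper that toggles O_NONBLOCK through that path; stdin is used only as a convenient example descriptor.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
set_nonblock(int fd, int on)
{
	int flags = fcntl(fd, F_GETFL);	/* F_GETFL returns OFLAGS(fp->f_flag) */

	if (flags == -1)
		return (-1);
	if (on)
		flags |= O_NONBLOCK;
	else
		flags &= ~O_NONBLOCK;
	return (fcntl(fd, F_SETFL, flags));	/* pushed down via FIONBIO */
}

int
main(void)
{
	if (set_nonblock(STDIN_FILENO, 1) == -1)
		perror("fcntl");
	else
		printf("stdin flags now %#x\n", fcntl(STDIN_FILENO, F_GETFL));
	return (0);
}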
405 FILEDESC_UNLOCK(fdp); 470 */ 406 error = EBADF; 471 if (old < 0 || new < 0 || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur | 407 break; | 408 } 472 new >= maxfilesperproc) 409 flp = (struct flock *)arg; 473 return (EBADF); 410 if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && 474 FILEDESC_LOCK(fdp); 411 flp->l_type != F_UNLCK) { 475 if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { 412 FILEDESC_UNLOCK(fdp); 476 FILEDESC_UNLOCK(fdp); 413 error = EINVAL; 477 return (EBADF); 414 break; 478 } 415 } 479 if (type == DUP_FIXED && old == new) { 416 if (flp->l_whence == SEEK_CUR) { 480 *retval = new; 417 if ((flp->l_start > 0 && 481 FILEDESC_UNLOCK(fdp); 418 fp->f_offset > OFF_MAX - flp->l_start) || 482 return (0); 419 (flp->l_start < 0 && 483 } 420 fp->f_offset < OFF_MIN - flp->l_start)) { 484 fp = fdp->fd_ofiles[old]; 421 FILEDESC_UNLOCK(fdp); 485 fhold(fp); 422 error = EOVERFLOW; 486 423 break; 487 /* 424 } 488 * Expand the table for the new descriptor if needed. This may 425 flp->l_start += fp->f_offset; 489 * block and drop and reacquire the filedesc lock. 426 } 490 */ 427 /* 491 if (type == DUP_VARIABLE || new >= fdp->fd_nfiles) { 428 * VOP_ADVLOCK() may block. 492 error = fdalloc(td, new, &newfd); 429 */ 493 if (error) { 430 fhold(fp); 494 FILEDESC_UNLOCK(fdp); 431 FILEDESC_UNLOCK(fdp); 495 fdrop(fp, td); 432 vp = fp->f_vnode; 496 return (error); 433 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, 497 } 434 F_POSIX); 498 } 435 fdrop(fp, td); 499 if (type == DUP_VARIABLE) 436 break; 500 new = newfd; 437 default: 501 438 FILEDESC_UNLOCK(fdp); 502 /* 439 error = EINVAL; 503 * If the old file changed out from under us then treat it as a 440 break; 504 * bad file descriptor. Userland should do its own locking to 441 } 505 * avoid this case. 442 done2: 506 */ 443 mtx_unlock(&Giant); 507 if (fdp->fd_ofiles[old] != fp) { 444 return (error); 508 if (fdp->fd_ofiles[new] == NULL) { 445 } 509 if (new < fdp->fd_freefile) 446 510 fdp->fd_freefile = new; 447 /* 511 while (fdp->fd_lastfile > 0 && 448 * Common code for dup, dup2, and fcntl(F_DUPFD). 512 fdp->fd_ofiles[fdp->fd_lastfile] == NULL) 449 */ 513 fdp->fd_lastfile--; 450 static int 514 } 451 do_dup(td, type, old, new, retval) 515 FILEDESC_UNLOCK(fdp); 452 enum dup_type type; 516 fdrop(fp, td); 453 int old, new; 517 return (EBADF); 454 register_t *retval; 518 } 12/30/03 12:13:19 sys/kern/kern_descrip.c 5 519 KASSERT(old != new, ("new fd is same as old")); 584 { 520 585 struct sigio *sigio; 521 /* 586 522 * Save info on the descriptor being overwritten. We have 587 SIGIO_LOCK(); 523 * to do the unmap now, but we cannot close it without 588 sigio = *sigiop; 524 * introducing an ownership race for the slot. 589 if (sigio == NULL) { 525 */ 590 SIGIO_UNLOCK(); 526 delfp = fdp->fd_ofiles[new]; 591 return; 527 if (delfp != NULL && p->p_fdtol != NULL) { 592 } 528 /* 593 *(sigio->sio_myref) = NULL; 529 * Ask fdfree() to sleep to ensure that all relevant 594 if ((sigio)->sio_pgid < 0) { 530 * process leaders can be traversed in closef(). 
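The F_SETLK/F_SETLKW/F_GETLK cases above hand a struct flock to VOP_ADVLOCK() on behalf of the process leader. A userland sketch of the same interface; the file name lockdemo.dat and the 100-byte range are illustrative only.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("lockdemo.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return (1);
	}

	struct flock fl = {
		.l_type = F_WRLCK,	/* exclusive record lock */
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 100,		/* first 100 bytes only */
	};

	if (fcntl(fd, F_SETLK, &fl) == -1) {	/* non-blocking; F_SETLKW would sleep */
		perror("F_SETLK");
		/* F_GETLK reports who holds the conflicting lock. */
		if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type != F_UNLCK)
			printf("held by pid %d\n", (int)fl.l_pid);
		return (1);
	}
	printf("lock acquired; press return to release\n");
	getchar();
	return (0);
}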
595 struct pgrp *pg = (sigio)->sio_pgrp; 531 */ 596 PGRP_LOCK(pg); 532 fdp->fd_holdleaderscount++; 597 SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, 533 holdleaders = 1; 598 sigio, sio_pgsigio); 534 } else 599 PGRP_UNLOCK(pg); 535 holdleaders = 0; 600 } else { 536 KASSERT(delfp == NULL || type == DUP_FIXED, 601 struct proc *p = (sigio)->sio_proc; 537 ("dup() picked an open file")); 602 PROC_LOCK(p); 538 #if 0 603 SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, 539 if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED)) 604 sigio, sio_pgsigio); 540 (void) munmapfd(td, new); 605 PROC_UNLOCK(p); 541 #endif 606 } 542 607 SIGIO_UNLOCK(); 543 /* 608 crfree(sigio->sio_ucred); 544 * Duplicate the source descriptor, update lastfile 609 FREE(sigio, M_SIGIO); 545 */ 610 } 546 fdp->fd_ofiles[new] = fp; 611 547 fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &˜ UF_EXCLOSE; 612 /* 548 if (new > fdp->fd_lastfile) 613 * Free a list of sigio structures. 549 fdp->fd_lastfile = new; 614 * We only need to lock the SIGIO_LOCK because we have made ourselves 550 FILEDESC_UNLOCK(fdp); 615 * inaccessable to callers of fsetown and therefore do not need to lock 551 *retval = new; 616 * the proc or pgrp struct for the list manipulation. 552 617 */ 553 /* 618 void 554 * If we dup’d over a valid file, we now own the reference to it 619 funsetownlst(sigiolst) 555 * and must dispose of it using closef() semantics (as if a 620 struct sigiolst *sigiolst; 556 * close() were performed on it). 621 { 557 */ 622 struct proc *p; 558 if (delfp) { 623 struct pgrp *pg; 559 mtx_lock(&Giant); 624 struct sigio *sigio; 560 (void) closef(delfp, td); 625 561 mtx_unlock(&Giant); 626 sigio = SLIST_FIRST(sigiolst); 562 if (holdleaders) { 627 if (sigio == NULL) 563 FILEDESC_LOCK(fdp); 628 return; 564 fdp->fd_holdleaderscount--; 629 p = NULL; 565 if (fdp->fd_holdleaderscount == 0 && 630 pg = NULL; 566 fdp->fd_holdleaderswakeup != 0) { 631 567 fdp->fd_holdleaderswakeup = 0; 632 /* 568 wakeup(&fdp->fd_holdleaderscount); 633 * Every entry of the list should belong 569 } 634 * to a single proc or pgrp. 570 FILEDESC_UNLOCK(fdp); 635 */ 571 } 636 if (sigio->sio_pgid < 0) { 572 } 637 pg = sigio->sio_pgrp; 573 return (0); 638 PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); 574 } 639 } else /* if (sigio->sio_pgid > 0) */ { 575 640 p = sigio->sio_proc; 576 /* 641 PROC_LOCK_ASSERT(p, MA_NOTOWNED); 577 * If sigio is on the list associated with a process or process group, 642 } 578 * disable signalling from the device, remove sigio from the list and 643 579 * free sigio. 644 SIGIO_LOCK(); 580 */ 645 while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { 581 void 646 *(sigio->sio_myref) = NULL; 582 funsetown(sigiop) 647 if (pg != NULL) { 583 struct sigio **sigiop; 648 KASSERT(sigio->sio_pgid < 0, 12/30/03 12:13:19 sys/kern/kern_descrip.c 6 649 ("Proc sigio in pgrp sigio list")); 713 * in another session. 650 KASSERT(sigio->sio_pgrp == pg, 714 * 651 ("Bogus pgrp in sigio list")); 715 * Remove this test to allow maximum flexibility or 652 PGRP_LOCK(pg); 716 * restrict FSETOWN to the current process or process 653 SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, 717 * group for maximum safety. 
654 sio_pgsigio); 718 */ 655 PGRP_UNLOCK(pg); 719 PROC_UNLOCK(proc); 656 } else /* if (p != NULL) */ { 720 if (proc->p_session != curthread->td_proc->p_session) { 657 KASSERT(sigio->sio_pgid > 0, 721 ret = EPERM; 658 ("Pgrp sigio in proc sigio list")); 722 goto fail; 659 KASSERT(sigio->sio_proc == p, 723 } 660 ("Bogus proc in sigio list")); 724 661 PROC_LOCK(p); 725 pgrp = NULL; 662 SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, 726 } else /* if (pgid < 0) */ { 663 sio_pgsigio); 727 pgrp = pgfind(-pgid); 664 PROC_UNLOCK(p); 728 if (pgrp == NULL) { 665 } 729 ret = ESRCH; 666 SIGIO_UNLOCK(); 730 goto fail; 667 crfree(sigio->sio_ucred); 731 } 668 FREE(sigio, M_SIGIO); 732 PGRP_UNLOCK(pgrp); 669 SIGIO_LOCK(); 733 670 } 734 /* 671 SIGIO_UNLOCK(); 735 * Policy - Don’t allow a process to FSETOWN a process 672 } 736 * in another session. 673 737 * 674 /* 738 * Remove this test to allow maximum flexibility or 675 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). 739 * restrict FSETOWN to the current process or process 676 * 740 * group for maximum safety. 677 * After permission checking, add a sigio structure to the sigio list for 741 */ 678 * the process or process group. 742 if (pgrp->pg_session != curthread->td_proc->p_session) { 679 */ 743 ret = EPERM; 680 int 744 goto fail; 681 fsetown(pgid, sigiop) 745 } 682 pid_t pgid; 746 683 struct sigio **sigiop; 747 proc = NULL; 684 { 748 } 685 struct proc *proc; 749 funsetown(sigiop); 686 struct pgrp *pgrp; 750 if (pgid > 0) { 687 struct sigio *sigio; 751 PROC_LOCK(proc); 688 int ret; 752 /* 689 753 * Since funsetownlst() is called without the proctree 690 if (pgid == 0) { 754 * locked, we need to check for P_WEXIT. 691 funsetown(sigiop); 755 * XXX: is ESRCH correct? 692 return (0); 756 */ 693 } 757 if ((proc->p_flag & P_WEXIT) != 0) { 694 758 PROC_UNLOCK(proc); 695 ret = 0; 759 ret = ESRCH; 696 760 goto fail; 697 /* Allocate and fill in the new sigio out of locks. */ 761 } 698 MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK) 762 SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); ; 763 sigio->sio_proc = proc; 699 sigio->sio_pgid = pgid; 764 PROC_UNLOCK(proc); 700 sigio->sio_ucred = crhold(curthread->td_ucred); 765 } else { 701 sigio->sio_myref = sigiop; 766 PGRP_LOCK(pgrp); 702 767 SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); 703 sx_slock(&proctree_lock); 768 sigio->sio_pgrp = pgrp; 704 if (pgid > 0) { 769 PGRP_UNLOCK(pgrp); 705 proc = pfind(pgid); 770 } 706 if (proc == NULL) { 771 sx_sunlock(&proctree_lock); 707 ret = ESRCH; 772 SIGIO_LOCK(); 708 goto fail; 773 *sigiop = sigio; 709 } 774 SIGIO_UNLOCK(); 710 775 return (0); 711 /* 776 712 * Policy - Don’t allow a process to FSETOWN a process 777 fail: 12/30/03 12:13:19 sys/kern/kern_descrip.c 7 778 sx_sunlock(&proctree_lock); 843 */ 779 crfree(sigio->sio_ucred); 844 fdp->fd_holdleaderscount++; 780 FREE(sigio, M_SIGIO); 845 holdleaders = 1; 781 return (ret); 846 } 782 } 847 783 848 /* 784 /* 849 * we now hold the fp reference that used to be owned by the descripto 785 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). r 786 */ 850 * array. 787 pid_t 851 */ 788 fgetown(sigiop) 852 while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NUL 789 struct sigio **sigiop; L) 790 { 853 fdp->fd_lastfile--; 791 pid_t pgid; 854 if (fd < fdp->fd_freefile) 792 855 fdp->fd_freefile = fd; 793 SIGIO_LOCK(); 856 if (fd < fdp->fd_knlistsize) { 794 pgid = (*sigiop != NULL) ? 
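fsetown() is reached from fcntl(F_SETOWN) and the FIOSETOWN ioctl and records which process or process group should receive SIGIO. A userland sketch that directs SIGIO at its own pid and enables async notification with O_ASYNC; it assumes the descriptor (a terminal here) supports FIOASYNC.

#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigio;

static void
on_sigio(int sig)
{
	(void)sig;
	got_sigio = 1;
}

int
main(void)
{
	signal(SIGIO, on_sigio);

	/* Deliver SIGIO for this descriptor to our own pid (the fsetown path). */
	if (fcntl(STDIN_FILENO, F_SETOWN, getpid()) == -1)
		perror("F_SETOWN");

	/* F_SETFL with O_ASYNC ends up in the FIOASYNC ioctl shown above. */
	int flags = fcntl(STDIN_FILENO, F_GETFL);
	if (fcntl(STDIN_FILENO, F_SETFL, flags | O_ASYNC) == -1)
		perror("F_SETFL O_ASYNC");

	printf("type something...\n");
	while (!got_sigio)
		pause();
	printf("SIGIO delivered\n");
	return (0);
}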
(*sigiop)->sio_pgid : 0; 857 FILEDESC_UNLOCK(fdp); 795 SIGIO_UNLOCK(); 858 knote_fdclose(td, fd); 796 return (pgid); 859 } else 797 } 860 FILEDESC_UNLOCK(fdp); 798 861 799 /* 862 error = closef(fp, td); 800 * Close a file descriptor. 863 done2: 801 */ 864 mtx_unlock(&Giant); 802 #ifndef _SYS_SYSPROTO_H_ 865 if (holdleaders) { 803 struct close_args { 866 FILEDESC_LOCK(fdp); 804 int fd; 867 fdp->fd_holdleaderscount--; 805 }; 868 if (fdp->fd_holdleaderscount == 0 && 806 #endif 869 fdp->fd_holdleaderswakeup != 0) { 807 /* 870 fdp->fd_holdleaderswakeup = 0; 808 * MPSAFE 871 wakeup(&fdp->fd_holdleaderscount); 809 */ 872 } 810 /* ARGSUSED */ 873 FILEDESC_UNLOCK(fdp); 811 int 874 } 812 close(td, uap) 875 return (error); 813 struct thread *td; 876 } 814 struct close_args *uap; 877 815 { 878 #if defined(COMPAT_43) || defined(COMPAT_SUNOS) 816 struct filedesc *fdp; 879 /* 817 struct file *fp; 880 * Return status information about a file descriptor. 818 int fd, error; 881 */ 819 int holdleaders; 882 #ifndef _SYS_SYSPROTO_H_ 820 883 struct ofstat_args { 821 fd = uap->fd; 884 int fd; 822 error = 0; 885 struct ostat *sb; 823 holdleaders = 0; 886 }; 824 fdp = td->td_proc->p_fd; 887 #endif 825 mtx_lock(&Giant); 888 /* 826 FILEDESC_LOCK(fdp); 889 * MPSAFE 827 if ((unsigned)fd >= fdp->fd_nfiles || 890 */ 828 (fp = fdp->fd_ofiles[fd]) == NULL) { 891 /* ARGSUSED */ 829 FILEDESC_UNLOCK(fdp); 892 int 830 error = EBADF; 893 ofstat(td, uap) 831 goto done2; 894 struct thread *td; 832 } 895 struct ofstat_args *uap; 833 #if 0 896 { 834 if (fdp->fd_ofileflags[fd] & UF_MAPPED) 897 struct file *fp; 835 (void) munmapfd(td, fd); 898 struct stat ub; 836 #endif 899 struct ostat oub; 837 fdp->fd_ofiles[fd] = NULL; 900 int error; 838 fdp->fd_ofileflags[fd] = 0; 901 839 if (td->td_proc->p_fdtol != NULL) { 902 if ((error = fget(td, uap->fd, &fp)) != 0) 840 /* 903 goto done2; 841 * Ask fdfree() to sleep to ensure that all relevant 904 mtx_lock(&Giant); 842 * process leaders can be traversed in closef(). 905 error = fo_stat(fp, &ub, td->td_ucred, td); 12/30/03 12:13:19 sys/kern/kern_descrip.c 8 906 mtx_unlock(&Giant); 971 struct nstat nub; 907 if (error == 0) { 972 int error; 908 cvtstat(&ub, &oub); 973 909 error = copyout(&oub, uap->sb, sizeof(oub)); 974 if ((error = fget(td, uap->fd, &fp)) != 0) 910 } 975 goto done2; 911 fdrop(fp, td); 976 mtx_lock(&Giant); 912 done2: 977 error = fo_stat(fp, &ub, td->td_ucred, td); 913 return (error); 978 mtx_unlock(&Giant); 914 } 979 if (error == 0) { 915 #endif /* COMPAT_43 || COMPAT_SUNOS */ 980 cvtnstat(&ub, &nub); 916 981 error = copyout(&nub, uap->sb, sizeof(nub)); 917 /* 982 } 918 * Return status information about a file descriptor. 983 fdrop(fp, td); 919 */ 984 done2: 920 #ifndef _SYS_SYSPROTO_H_ 985 return (error); 921 struct fstat_args { 986 } 922 int fd; 987 923 struct stat *sb; 988 /* 924 }; 989 * Return pathconf information about a file descriptor. 
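close() only clears the descriptor-table slot and drops one reference through closef(); the underlying struct file, including its offset, lives on while other references exist, for example after fork(). A small demonstration of that sharing; /etc/services is just a convenient readable file.

#include <sys/wait.h>

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/etc/services", O_RDONLY);
	char buf[8];

	if (fd < 0) {
		perror("open");
		return (1);
	}
	if (fork() == 0) {
		/* The child advances the offset in the shared struct file. */
		if (read(fd, buf, sizeof(buf)) < 0)
			_exit(1);
		_exit(0);
	}
	wait(NULL);

	/*
	 * Both descriptors reference the same open file, so the parent sees
	 * the offset the child left behind; a close() in either process
	 * would only drop a reference, not disturb the other.
	 */
	printf("parent offset after child's read: %ld\n",
	    (long)lseek(fd, 0, SEEK_CUR));
	close(fd);
	return (0);
}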
925 #endif 990 */ 926 /* 991 #ifndef _SYS_SYSPROTO_H_ 927 * MPSAFE 992 struct fpathconf_args { 928 */ 993 int fd; 929 /* ARGSUSED */ 994 int name; 930 int 995 }; 931 fstat(td, uap) 996 #endif 932 struct thread *td; 997 /* 933 struct fstat_args *uap; 998 * MPSAFE 934 { 999 */ 935 struct file *fp; 1000 /* ARGSUSED */ 936 struct stat ub; 1001 int 937 int error; 1002 fpathconf(td, uap) 938 1003 struct thread *td; 939 if ((error = fget(td, uap->fd, &fp)) != 0) 1004 struct fpathconf_args *uap; 940 goto done2; 1005 { 941 mtx_lock(&Giant); 1006 struct file *fp; 942 error = fo_stat(fp, &ub, td->td_ucred, td); 1007 struct vnode *vp; 943 mtx_unlock(&Giant); 1008 int error; 944 if (error == 0) 1009 945 error = copyout(&ub, uap->sb, sizeof(ub)); 1010 if ((error = fget(td, uap->fd, &fp)) != 0) 946 fdrop(fp, td); 1011 return (error); 947 done2: 1012 948 return (error); 1013 /* If asynchronous I/O is available, it works for all descriptors. */ 949 } 1014 if (uap->name == _PC_ASYNC_IO) { 950 1015 td->td_retval[0] = async_io_version; 951 /* 1016 goto out; 952 * Return status information about a file descriptor. 1017 } 953 */ 1018 vp = fp->f_vnode; 954 #ifndef _SYS_SYSPROTO_H_ 1019 if (vp != NULL) { 955 struct nfstat_args { 1020 mtx_lock(&Giant); 956 int fd; 1021 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 957 struct nstat *sb; 1022 error = VOP_PATHCONF(vp, uap->name, td->td_retval); 958 }; 1023 VOP_UNLOCK(vp, 0, td); 959 #endif 1024 mtx_unlock(&Giant); 960 /* 1025 } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { 961 * MPSAFE 1026 if (uap->name != _PC_PIPE_BUF) { 962 */ 1027 error = EINVAL; 963 /* ARGSUSED */ 1028 } else { 964 int 1029 td->td_retval[0] = PIPE_BUF; 965 nfstat(td, uap) 1030 error = 0; 966 struct thread *td; 1031 } 967 struct nfstat_args *uap; 1032 } else { 968 { 1033 error = EOPNOTSUPP; 969 struct file *fp; 1034 } 970 struct stat ub; 1035 out: 12/30/03 12:13:19 sys/kern/kern_descrip.c 9 1036 fdrop(fp, td); 1101 if (fdp->fd_nfiles >= nfiles) { 1037 return (error); 1102 FILEDESC_UNLOCK(fdp); 1038 } 1103 free(newofile, M_FILEDESC); 1039 1104 FILEDESC_LOCK(fdp); 1040 /* 1105 continue; 1041 * Allocate a file descriptor for the process. 1106 } 1042 */ 1107 newofileflags = (char *) &newofile[nfiles]; 1043 static int fdexpand; 1108 /* 1044 SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); 1109 * Copy the existing ofile and ofileflags arrays 1045 1110 * and zero the new portion of each array. 1046 int 1111 */ 1047 fdalloc(td, want, result) 1112 i = fdp->fd_nfiles * sizeof(struct file *); 1048 struct thread *td; 1113 bcopy(fdp->fd_ofiles, newofile, i); 1049 int want; 1114 bzero((char *)newofile + i, 1050 int *result; 1115 nfiles * sizeof(struct file *) - i); 1051 { 1116 i = fdp->fd_nfiles * sizeof(char); 1052 struct proc *p = td->td_proc; 1117 bcopy(fdp->fd_ofileflags, newofileflags, i); 1053 struct filedesc *fdp = td->td_proc->p_fd; 1118 bzero(newofileflags + i, nfiles * sizeof(char) - i); 1054 int i; 1119 if (fdp->fd_nfiles > NDFILE) 1055 int lim, last, nfiles; 1120 oldofile = fdp->fd_ofiles; 1056 struct file **newofile, **oldofile; 1121 else 1057 char *newofileflags; 1122 oldofile = NULL; 1058 1123 fdp->fd_ofiles = newofile; 1059 FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); 1124 fdp->fd_ofileflags = newofileflags; 1060 1125 fdp->fd_nfiles = nfiles; 1061 /* 1126 fdexpand++; 1062 * Search for a free descriptor starting at the higher 1127 if (oldofile != NULL) { 1063 * of want or fd_freefile. 
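fpathconf() answers _PC_ASYNC_IO directly from async_io_version and answers _PC_PIPE_BUF for pipes and sockets without consulting a vnode. A userland sketch exercising both branches; /etc/passwd is an arbitrary example file.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int p[2];
	int fd;

	if (pipe(p) == 0)
		printf("_PC_PIPE_BUF on a pipe: %ld\n",
		    fpathconf(p[0], _PC_PIPE_BUF));

	if ((fd = open("/etc/passwd", O_RDONLY)) >= 0) {
		printf("_PC_ASYNC_IO on a file: %ld\n",
		    fpathconf(fd, _PC_ASYNC_IO));
		close(fd);
	}
	return (0);
}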
If that fails, consider 1128 FILEDESC_UNLOCK(fdp); 1064 * expanding the ofile array. 1129 free(oldofile, M_FILEDESC); 1065 */ 1130 FILEDESC_LOCK(fdp); 1066 lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); 1131 } 1067 for (;;) { 1132 } 1068 last = min(fdp->fd_nfiles, lim); 1133 } 1069 i = max(want, fdp->fd_freefile); 1134 1070 for (; i < last; i++) { 1135 /* 1071 if (fdp->fd_ofiles[i] == NULL) { 1136 * Check to see whether n user file descriptors 1072 fdp->fd_ofileflags[i] = 0; 1137 * are available to the process p. 1073 if (i > fdp->fd_lastfile) 1138 */ 1074 fdp->fd_lastfile = i; 1139 int 1075 if (want <= fdp->fd_freefile) 1140 fdavail(td, n) 1076 fdp->fd_freefile = i; 1141 struct thread *td; 1077 *result = i; 1142 int n; 1078 return (0); 1143 { 1079 } 1144 struct proc *p = td->td_proc; 1080 } 1145 struct filedesc *fdp = td->td_proc->p_fd; 1081 1146 struct file **fpp; 1082 /* 1147 int i, lim, last; 1083 * No space in current array. Expand? 1148 1084 */ 1149 FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); 1085 if (i >= lim) 1150 1086 return (EMFILE); 1151 lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); 1087 if (fdp->fd_nfiles < NDEXTENT) 1152 if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) 1088 nfiles = NDEXTENT; 1153 return (1); 1089 else 1154 last = min(fdp->fd_nfiles, lim); 1090 nfiles = 2 * fdp->fd_nfiles; 1155 fpp = &fdp->fd_ofiles[fdp->fd_freefile]; 1091 while (nfiles < want) 1156 for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { 1092 nfiles <<= 1; 1157 if (*fpp == NULL && --n <= 0) 1093 FILEDESC_UNLOCK(fdp); 1158 return (1); 1094 newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK); 1159 } 1095 1160 return (0); 1096 /* 1161 } 1097 * Deal with file-table extend race that might have 1162 1098 * occurred while filedesc was unlocked. 1163 /* 1099 */ 1164 * Create a new open file structure and allocate 1100 FILEDESC_LOCK(fdp); 1165 * a file decriptor for the process that refers to it. 12/30/03 12:13:19 sys/kern/kern_descrip.c 10 1166 * We add one reference to the file for the descriptor table 1230 } 1167 * and one reference for resultfp. This is to prevent us being 1231 1168 * prempted and the entry in the descriptor table closed after 1232 /* 1169 * we release the FILEDESC lock. 1233 * Free a file descriptor. 1170 */ 1234 */ 1171 int 1235 void 1172 falloc(td, resultfp, resultfd) 1236 ffree(fp) 1173 struct thread *td; 1237 struct file *fp; 1174 struct file **resultfp; 1238 { 1175 int *resultfd; 1239 1176 { 1240 KASSERT(fp->f_count == 0, ("ffree: fp_fcount not 0!")); 1177 struct proc *p = td->td_proc; 1241 sx_xlock(&filelist_lock); 1178 struct file *fp, *fq; 1242 LIST_REMOVE(fp, f_list); 1179 int error, i; 1243 nfiles--; 1180 int maxuserfiles = maxfiles - (maxfiles / 20); 1244 sx_xunlock(&filelist_lock); 1181 static struct timeval lastfail; 1245 crfree(fp->f_cred); 1182 static int curfail; 1246 uma_zfree(file_zone, fp); 1183 1247 } 1184 fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); 1248 1185 sx_xlock(&filelist_lock); 1249 /* 1186 if ((nfiles >= maxuserfiles && td->td_ucred->cr_ruid != 0) 1250 * Build a new filedesc structure from another. 1187 || nfiles >= maxfiles) { 1251 * Copy the current, root, and jail root vnode references. 
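fdalloc() grows the descriptor table to NDEXTENT slots the first time and doubles it afterwards, then keeps doubling until the wanted index fits. The standalone sketch below mimics only that sizing decision; the NDFILE and NDEXTENT values are placeholders, since the real definitions live in <sys/filedesc.h> and are not part of this listing.

#include <stdio.h>

#define NDFILE   20	/* placeholder: initial in-structure table size */
#define NDEXTENT 50	/* placeholder: first extension size */

/*
 * Mimic the growth decision in fdalloc(): start from NDEXTENT, otherwise
 * double the current table, then keep doubling until 'want' fits.
 */
static int
next_table_size(int nfiles, int want)
{
	int n = (nfiles < NDEXTENT) ? NDEXTENT : 2 * nfiles;

	while (n < want)
		n <<= 1;
	return (n);
}

int
main(void)
{
	int n = NDFILE;

	for (int want = 25; want <= 400; want *= 2) {
		n = next_table_size(n, want);
		printf("want %3d -> table grows to %d slots\n", want, n);
	}
	return (0);
}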
1188 if (ppsratecheck(&lastfail, &curfail, 1)) { 1252 */ 1189 printf("kern.maxfiles limit exceeded by uid %i, please 1253 struct filedesc * see tuning(7).\n", 1254 fdinit(fdp) 1190 td->td_ucred->cr_ruid); 1255 struct filedesc *fdp; 1191 } 1256 { 1192 sx_xunlock(&filelist_lock); 1257 struct filedesc0 *newfdp; 1193 uma_zfree(file_zone, fp); 1258 1194 return (ENFILE); 1259 MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), 1195 } 1260 M_FILEDESC, M_WAITOK | M_ZERO); 1196 nfiles++; 1261 mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); 1197 1262 newfdp->fd_fd.fd_cdir = fdp->fd_cdir; 1198 /* 1263 if (newfdp->fd_fd.fd_cdir) 1199 * If the process has file descriptor zero open, add the new file 1264 VREF(newfdp->fd_fd.fd_cdir); 1200 * descriptor to the list of open files at that point, otherwise 1265 newfdp->fd_fd.fd_rdir = fdp->fd_rdir; 1201 * put it at the front of the list of open files. 1266 if (newfdp->fd_fd.fd_rdir) 1202 */ 1267 VREF(newfdp->fd_fd.fd_rdir); 1203 fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep); 1268 newfdp->fd_fd.fd_jdir = fdp->fd_jdir; 1204 fp->f_count = 1; 1269 if (newfdp->fd_fd.fd_jdir) 1205 if (resultfp) 1270 VREF(newfdp->fd_fd.fd_jdir); 1206 fp->f_count++; 1271 1207 fp->f_cred = crhold(td->td_ucred); 1272 /* Create the file descriptor table. */ 1208 fp->f_ops = &badfileops; 1273 newfdp->fd_fd.fd_refcnt = 1; 1209 FILEDESC_LOCK(p->p_fd); 1274 newfdp->fd_fd.fd_cmask = CMASK; 1210 if ((fq = p->p_fd->fd_ofiles[0])) { 1275 newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; 1211 LIST_INSERT_AFTER(fq, fp, f_list); 1276 newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; 1212 } else { 1277 newfdp->fd_fd.fd_nfiles = NDFILE; 1213 LIST_INSERT_HEAD(&filehead, fp, f_list); 1278 newfdp->fd_fd.fd_knlistsize = -1; 1214 } 1279 return (&newfdp->fd_fd); 1215 sx_xunlock(&filelist_lock); 1280 } 1216 if ((error = fdalloc(td, 0, &i))) { 1281 1217 FILEDESC_UNLOCK(p->p_fd); 1282 /* 1218 fdrop(fp, td); 1283 * Share a filedesc structure. 1219 if (resultfp) 1284 */ 1220 fdrop(fp, td); 1285 struct filedesc * 1221 return (error); 1286 fdshare(fdp) 1222 } 1287 struct filedesc *fdp; 1223 p->p_fd->fd_ofiles[i] = fp; 1288 { 1224 FILEDESC_UNLOCK(p->p_fd); 1289 FILEDESC_LOCK(fdp); 1225 if (resultfp) 1290 fdp->fd_refcnt++; 1226 *resultfp = fp; 1291 FILEDESC_UNLOCK(fdp); 1227 if (resultfd) 1292 return (fdp); 1228 *resultfd = i; 1293 } 1229 return (0); 1294 12/30/03 12:13:19 sys/kern/kern_descrip.c 11 1295 /* 1360 j = newfdp->fd_nfiles; 1296 * Copy a filedesc structure. 1361 while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2) 1297 * A NULL pointer in returns a NULL reference, this is to ease callers, 1362 j /= 2; 1298 * not catch errors. 1363 if (i != j) { 1299 */ 1364 /* 1300 struct filedesc * 1365 * The size of the original table has changed. 1301 fdcopy(fdp) 1366 * Go over once again. 1302 struct filedesc *fdp; 1367 */ 1303 { 1368 FILEDESC_UNLOCK(fdp); 1304 struct filedesc *newfdp; 1369 FREE(newfdp->fd_ofiles, M_FILEDESC); 1305 struct file **fpp; 1370 FILEDESC_LOCK(fdp); 1306 int i, j; 1371 newfdp->fd_lastfile = fdp->fd_lastfile; 1307 1372 newfdp->fd_nfiles = fdp->fd_nfiles; 1308 /* Certain daemons might not have file descriptors. 
*/ 1373 goto retry; 1309 if (fdp == NULL) 1374 } 1310 return (NULL); 1375 newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; 1311 1376 } 1312 FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); 1377 newfdp->fd_nfiles = i; 1313 1378 bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **)); 1314 FILEDESC_UNLOCK(fdp); 1379 bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char)); 1315 MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0), 1380 1316 M_FILEDESC, M_WAITOK); 1381 /* 1317 FILEDESC_LOCK(fdp); 1382 * kq descriptors cannot be copied. 1318 bcopy(fdp, newfdp, sizeof(struct filedesc)); 1383 */ 1319 FILEDESC_UNLOCK(fdp); 1384 if (newfdp->fd_knlistsize != -1) { 1320 bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx)); 1385 fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile]; 1321 mtx_init(&newfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); 1386 for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) { 1322 if (newfdp->fd_cdir) 1387 if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) { 1323 VREF(newfdp->fd_cdir); 1388 *fpp = NULL; 1324 if (newfdp->fd_rdir) 1389 if (i < newfdp->fd_freefile) 1325 VREF(newfdp->fd_rdir); 1390 newfdp->fd_freefile = i; 1326 if (newfdp->fd_jdir) 1391 } 1327 VREF(newfdp->fd_jdir); 1392 if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0) 1328 newfdp->fd_refcnt = 1; 1393 newfdp->fd_lastfile--; 1329 1394 } 1330 /* 1395 newfdp->fd_knlist = NULL; 1331 * If the number of open files fits in the internal arrays 1396 newfdp->fd_knlistsize = -1; 1332 * of the open file structure, use them, otherwise allocate 1397 newfdp->fd_knhash = NULL; 1333 * additional memory for the number of descriptors currently 1398 newfdp->fd_knhashmask = 0; 1334 * in use. 1399 } 1335 */ 1400 1336 FILEDESC_LOCK(fdp); 1401 fpp = newfdp->fd_ofiles; 1337 newfdp->fd_lastfile = fdp->fd_lastfile; 1402 for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) { 1338 newfdp->fd_nfiles = fdp->fd_nfiles; 1403 if (*fpp != NULL) 1339 if (newfdp->fd_lastfile < NDFILE) { 1404 fhold(*fpp); 1340 newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles; 1405 } 1341 newfdp->fd_ofileflags = 1406 return (newfdp); 1342 ((struct filedesc0 *) newfdp)->fd_dfileflags; 1407 } 1343 i = NDFILE; 1408 1344 } else { 1409 /* A mutex to protect the association between a proc and filedesc. */ 1345 /* 1410 struct mtx fdesc_mtx; 1346 * Compute the smallest multiple of NDEXTENT needed 1411 MTX_SYSINIT(fdesc, &fdesc_mtx, "fdesc", MTX_DEF); 1347 * for the file descriptors currently in use, 1412 1348 * allowing the table to shrink. 1413 /* 1349 */ 1414 * Release a filedesc structure. 1350 retry: 1415 */ 1351 i = newfdp->fd_nfiles; 1416 void 1352 while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) 1417 fdfree(td) 1353 i /= 2; 1418 struct thread *td; 1354 FILEDESC_UNLOCK(fdp); 1419 { 1355 MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE, 1420 struct filedesc *fdp; 1356 M_FILEDESC, M_WAITOK); 1421 struct file **fpp; 1357 FILEDESC_LOCK(fdp); 1422 int i; 1358 newfdp->fd_lastfile = fdp->fd_lastfile; 1423 struct filedesc_to_leader *fdtol; 1359 newfdp->fd_nfiles = fdp->fd_nfiles; 1424 struct file *fp; 12/30/03 12:13:19 sys/kern/kern_descrip.c 12 1425 struct vnode *vp; 1488 msleep(fdtol, &fdp->fd_mtx, 1426 struct flock lf; 1489 PLOCK, "fdlhold", 0); 1427 1490 goto retry; 1428 /* Certain daemons might not have file descriptors. 
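fdcopy() explicitly drops kqueue descriptors ("kq descriptors cannot be copied"), which is why a kqueue is not inherited by a child created with fork(). A userland sketch of that behavior; it assumes a FreeBSD system providing <sys/event.h>.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/wait.h>

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct timespec ts = { 0, 0 };
	int kq = kqueue();
	pid_t pid;

	if (kq < 0) {
		perror("kqueue");
		return (1);
	}
	pid = fork();
	if (pid == 0) {
		/* fdcopy() left this slot empty, so the descriptor is dead here. */
		if (kevent(kq, NULL, 0, NULL, 0, &ts) == -1)
			printf("child: kevent failed: %s\n", strerror(errno));
		_exit(0);
	}
	waitpid(pid, NULL, 0);

	/* In the parent the queue is still usable. */
	printf("parent: kevent returned %d\n", kevent(kq, NULL, 0, NULL, 0, &ts));
	return (0);
}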
*/ 1491 } 1429 fdp = td->td_proc->p_fd; 1492 } 1430 if (fdp == NULL) 1493 fdtol->fdl_refcount--; 1431 return; 1494 if (fdtol->fdl_refcount == 0 && 1432 1495 fdtol->fdl_holdcount == 0) { 1433 /* Check for special need to clear POSIX style locks */ 1496 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; 1434 fdtol = td->td_proc->p_fdtol; 1497 fdtol->fdl_prev->fdl_next = fdtol->fdl_next; 1435 if (fdtol != NULL) { 1498 } else 1436 FILEDESC_LOCK(fdp); 1499 fdtol = NULL; 1437 KASSERT(fdtol->fdl_refcount > 0, 1500 td->td_proc->p_fdtol = NULL; 1438 ("filedesc_to_refcount botch: fdl_refcount=%d", 1501 FILEDESC_UNLOCK(fdp); 1439 fdtol->fdl_refcount)); 1502 if (fdtol != NULL) 1440 if (fdtol->fdl_refcount == 1 && 1503 FREE(fdtol, M_FILEDESC_TO_LEADER); 1441 (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 1504 } 1442 i = 0; 1505 FILEDESC_LOCK(fdp); 1443 fpp = fdp->fd_ofiles; 1506 if (--fdp->fd_refcnt > 0) { 1444 for (i = 0, fpp = fdp->fd_ofiles; 1507 FILEDESC_UNLOCK(fdp); 1445 i < fdp->fd_lastfile; 1508 return; 1446 i++, fpp++) { 1509 } 1447 if (*fpp == NULL || 1510 1448 (*fpp)->f_type != DTYPE_VNODE) 1511 /* 1449 continue; 1512 * We are the last reference to the structure, so we can 1450 fp = *fpp; 1513 * safely assume it will not change out from under us. 1451 fhold(fp); 1514 */ 1452 FILEDESC_UNLOCK(fdp); 1515 FILEDESC_UNLOCK(fdp); 1453 lf.l_whence = SEEK_SET; 1516 fpp = fdp->fd_ofiles; 1454 lf.l_start = 0; 1517 for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { 1455 lf.l_len = 0; 1518 if (*fpp) 1456 lf.l_type = F_UNLCK; 1519 (void) closef(*fpp, td); 1457 vp = fp->f_vnode; 1520 } 1458 (void) VOP_ADVLOCK(vp, 1521 1459 (caddr_t)td->td_proc-> 1522 /* XXX This should happen earlier. */ 1460 p_leader, 1523 mtx_lock(&fdesc_mtx); 1461 F_UNLCK, 1524 td->td_proc->p_fd = NULL; 1462 &lf, 1525 mtx_unlock(&fdesc_mtx); 1463 F_POSIX); 1526 1464 FILEDESC_LOCK(fdp); 1527 if (fdp->fd_nfiles > NDFILE) 1465 fdrop(fp, td); 1528 FREE(fdp->fd_ofiles, M_FILEDESC); 1466 fpp = fdp->fd_ofiles + i; 1529 if (fdp->fd_cdir) 1467 } 1530 vrele(fdp->fd_cdir); 1468 } 1531 if (fdp->fd_rdir) 1469 retry: 1532 vrele(fdp->fd_rdir); 1470 if (fdtol->fdl_refcount == 1) { 1533 if (fdp->fd_jdir) 1471 if (fdp->fd_holdleaderscount > 0 && 1534 vrele(fdp->fd_jdir); 1472 (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) 1535 if (fdp->fd_knlist) { 1536 FREE(fdp->fd_knlist, M_KQUEUE); 1473 /* 1537 if (fdp->fd_knhash) 1474 * close() or do_dup() has cleared a reference 1538 FREE(fdp->fd_knhash, M_KQUEUE); 1475 * in a shared file descriptor table. 1539 mtx_destroy(&fdp->fd_mtx); 1476 */ 1540 FREE(fdp, M_FILEDESC); 1477 fdp->fd_holdleaderswakeup = 1; 1541 } 1478 msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx 1542 , 1543 /* 1479 PLOCK, "fdlhold", 0); 1544 * For setugid programs, we don’t want to people to use that setugidness 1480 goto retry; 1545 * to generate error messages which write to a file which otherwise would 1481 } 1546 * otherwise be off-limits to the process. We check for filesystems where 1482 if (fdtol->fdl_holdcount > 0) { 1547 * the vnode can change out from under us after execve (like [lin]procfs). 1483 /* 1548 * 1484 * Ensure that fdtol->fdl_leader 1549 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is 1485 * remains valid in closef(). 1550 * sufficient. We also don’t for check setugidness since we know we are. 
1486 */ 1551 */ 1487 fdtol->fdl_wakeup = 1; 1552 static int 12/30/03 12:13:19 sys/kern/kern_descrip.c 13 1553 is_unsafe(struct file *fp) 1617 1554 { 1618 /* 1555 if (fp->f_type == DTYPE_VNODE) { 1619 * Close any files on exec? 1556 struct vnode *vp = fp->f_vnode; 1620 */ 1557 1621 void 1558 if ((vp->v_vflag & VV_PROCDEP) != 0) 1622 fdcloseexec(td) 1559 return (1); 1623 struct thread *td; 1560 } 1624 { 1561 return (0); 1625 struct filedesc *fdp; 1562 } 1626 int i; 1563 1627 1564 /* 1628 /* Certain daemons might not have file descriptors. */ 1565 * Make this setguid thing safe, if at all possible. 1629 fdp = td->td_proc->p_fd; 1566 */ 1630 if (fdp == NULL) 1567 void 1631 return; 1568 setugidsafety(td) 1632 1569 struct thread *td; 1633 FILEDESC_LOCK(fdp); 1570 { 1634 1571 struct filedesc *fdp; 1635 /* 1572 int i; 1636 * We cannot cache fd_ofiles or fd_ofileflags since operations 1573 1637 * may block and rip them out from under us. 1574 /* Certain daemons might not have file descriptors. */ 1638 */ 1575 fdp = td->td_proc->p_fd; 1639 for (i = 0; i <= fdp->fd_lastfile; i++) { 1576 if (fdp == NULL) 1640 if (fdp->fd_ofiles[i] != NULL && 1577 return; 1641 (fdp->fd_ofileflags[i] & UF_EXCLOSE)) { 1578 1642 struct file *fp; 1579 /* 1643 1580 * Note: fdp->fd_ofiles may be reallocated out from under us while 1644 #if 0 1581 * we are blocked in a close. Be careful! 1645 if (fdp->fd_ofileflags[i] & UF_MAPPED) 1582 */ 1646 (void) munmapfd(td, i); 1583 FILEDESC_LOCK(fdp); 1647 #endif 1584 for (i = 0; i <= fdp->fd_lastfile; i++) { 1648 if (i < fdp->fd_knlistsize) { 1585 if (i > 2) 1649 FILEDESC_UNLOCK(fdp); 1586 break; 1650 knote_fdclose(td, i); 1587 if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { 1651 FILEDESC_LOCK(fdp); 1588 struct file *fp; 1652 } 1589 1653 /* 1590 #if 0 1654 * NULL-out descriptor prior to close to avoid 1591 if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0) 1655 * a race while close blocks. 1592 (void) munmapfd(td, i); 1656 */ 1593 #endif 1657 fp = fdp->fd_ofiles[i]; 1594 if (i < fdp->fd_knlistsize) { 1658 fdp->fd_ofiles[i] = NULL; 1595 FILEDESC_UNLOCK(fdp); 1659 fdp->fd_ofileflags[i] = 0; 1596 knote_fdclose(td, i); 1660 if (i < fdp->fd_freefile) 1597 FILEDESC_LOCK(fdp); 1661 fdp->fd_freefile = i; 1598 } 1662 FILEDESC_UNLOCK(fdp); 1599 /* 1663 (void) closef(fp, td); 1600 * NULL-out descriptor prior to close to avoid 1664 FILEDESC_LOCK(fdp); 1601 * a race while close blocks. 1665 } 1602 */ 1666 } 1603 fp = fdp->fd_ofiles[i]; 1667 while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NUL 1604 fdp->fd_ofiles[i] = NULL; L) 1605 fdp->fd_ofileflags[i] = 0; 1668 fdp->fd_lastfile--; 1606 if (i < fdp->fd_freefile) 1669 FILEDESC_UNLOCK(fdp); 1607 fdp->fd_freefile = i; 1670 } 1608 FILEDESC_UNLOCK(fdp); 1671 1609 (void) closef(fp, td); 1672 /* 1610 FILEDESC_LOCK(fdp); 1673 * It is unsafe for set[ug]id processes to be started with file 1611 } 1674 * descriptors 0..2 closed, as these descriptors are given implicit 1612 } 1675 * significance in the Standard C library. fdcheckstd() will create a 1613 while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NUL 1676 * descriptor referencing /dev/null for each of stdin, stdout, and L) 1677 * stderr that is not already open. 1614 fdp->fd_lastfile--; 1678 */ 1615 FILEDESC_UNLOCK(fdp); 1679 int 1616 } 1680 fdcheckstd(td) 12/30/03 12:13:19 sys/kern/kern_descrip.c 14 1681 struct thread *td; 1745 * Decrement reference count on file structure. 
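fdcloseexec() closes every descriptor whose UF_EXCLOSE flag is set, that is, descriptors marked FD_CLOEXEC with fcntl(F_SETFD). A userland sketch of the effect across execve(); parking the file on descriptor 9 and probing it from /bin/sh are illustrative choices.

#include <sys/wait.h>

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/null", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* Park the file on descriptor 9 and mark that slot close-on-exec. */
	dup2(fd, 9);
	fcntl(9, F_SETFD, FD_CLOEXEC);

	if (fork() == 0) {
		/* After execve(), fdcloseexec() has already closed slot 9. */
		execl("/bin/sh", "sh", "-c",
		    "( exec 3<&9 ) 2>/dev/null"
		    " && echo 'descriptor 9 survived exec'"
		    " || echo 'descriptor 9 was closed on exec'", (char *)NULL);
		_exit(127);
	}
	wait(NULL);
	return (0);
}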
1682 { 1746 * Note: td may be NULL when closing a file 1683 struct nameidata nd; 1747 * that was being passed in a message. 1684 struct filedesc *fdp; 1748 */ 1685 struct file *fp; 1749 int 1686 register_t retval; 1750 closef(fp, td) 1687 int fd, i, error, flags, devnull, extraref; 1751 struct file *fp; 1688 1752 struct thread *td; 1689 fdp = td->td_proc->p_fd; 1753 { 1690 if (fdp == NULL) 1754 struct vnode *vp; 1691 return (0); 1755 struct flock lf; 1692 devnull = -1; 1756 struct filedesc_to_leader *fdtol; 1693 error = 0; 1757 struct filedesc *fdp; 1694 for (i = 0; i < 3; i++) { 1758 1695 if (fdp->fd_ofiles[i] != NULL) 1759 if (fp == NULL) 1696 continue; 1760 return (0); 1697 if (devnull < 0) { 1761 /* 1698 error = falloc(td, &fp, &fd); 1762 * POSIX record locking dictates that any close releases ALL 1699 if (error != 0) 1763 * locks owned by this process. This is handled by setting 1700 break; 1764 * a flag in the unlock to free ONLY locks obeying POSIX 1701 /* Note extra ref on ‘fp’ held for us by falloc(). */ 1765 * semantics, and not to free BSD-style file locks. 1702 KASSERT(fd == i, ("oof, we didn’t get our fd")); 1766 * If the descriptor was in a message, POSIX-style locks 1703 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null", 1767 * aren’t passed with the descriptor. 1704 td); 1768 */ 1705 flags = FREAD | FWRITE; 1769 if (td != NULL && 1706 error = vn_open(&nd, &flags, 0, -1); 1770 fp->f_type == DTYPE_VNODE) { 1707 if (error != 0) { 1771 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { 1708 /* 1772 lf.l_whence = SEEK_SET; 1709 * Someone may have closed the entry in the 1773 lf.l_start = 0; 1710 * file descriptor table, so check it hasn’t 1774 lf.l_len = 0; 1711 * changed before dropping the reference count 1775 lf.l_type = F_UNLCK; . 1776 vp = fp->f_vnode; 1712 */ 1777 (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, 1713 extraref = 0; 1778 F_UNLCK, &lf, F_POSIX); 1714 FILEDESC_LOCK(fdp); 1779 } 1715 if (fdp->fd_ofiles[fd] == fp) { 1780 fdtol = td->td_proc->p_fdtol; 1716 fdp->fd_ofiles[fd] = NULL; 1781 if (fdtol != NULL) { 1717 extraref = 1; 1782 /* 1718 } 1783 * Handle special case where file descriptor table 1719 FILEDESC_UNLOCK(fdp); 1784 * is shared between multiple process leaders. 1720 fdrop(fp, td); 1785 */ 1721 if (extraref) 1786 fdp = td->td_proc->p_fd; 1722 fdrop(fp, td); 1787 FILEDESC_LOCK(fdp); 1723 break; 1788 for (fdtol = fdtol->fdl_next; 1724 } 1789 fdtol != td->td_proc->p_fdtol; 1725 NDFREE(&nd, NDF_ONLY_PNBUF); 1790 fdtol = fdtol->fdl_next) { 1726 fp->f_vnode = nd.ni_vp; 1791 if ((fdtol->fdl_leader->p_flag & 1727 fp->f_data = nd.ni_vp; 1792 P_ADVLOCK) == 0) 1728 fp->f_flag = flags; 1793 continue; 1729 fp->f_ops = &vnops; 1794 fdtol->fdl_holdcount++; 1730 fp->f_type = DTYPE_VNODE; 1795 FILEDESC_UNLOCK(fdp); 1731 VOP_UNLOCK(nd.ni_vp, 0, td); 1796 lf.l_whence = SEEK_SET; 1732 devnull = fd; 1797 lf.l_start = 0; 1733 fdrop(fp, td); 1798 lf.l_len = 0; 1734 } else { 1799 lf.l_type = F_UNLCK; 1735 error = do_dup(td, DUP_FIXED, devnull, i, &retval); 1800 vp = fp->f_vnode; 1736 if (error != 0) 1801 (void) VOP_ADVLOCK(vp, 1737 break; 1802 (caddr_t)fdtol->fdl_leader, 1738 } 1803 F_UNLCK, &lf, F_POSIX); 1739 } 1804 FILEDESC_LOCK(fdp); 1740 return (error); 1805 fdtol->fdl_holdcount--; 1741 } 1806 if (fdtol->fdl_holdcount == 0 && 1742 1807 fdtol->fdl_wakeup != 0) { 1743 /* 1808 fdtol->fdl_wakeup = 0; 1744 * Internal form of close. 
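The comment at the top of closef() is worth demonstrating: POSIX record locks belong to the process, so closing any descriptor for the file releases them, even a descriptor other than the one used to take the lock. A sketch with an illustrative file name lockdemo.dat; a child process is used as the observer because F_GETLK never reports the caller's own locks.

#include <sys/wait.h>

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Ask a child process whether anyone holds a write lock on the file. */
static void
check_from_child(const char *when)
{
	if (fork() == 0) {
		int fd = open("lockdemo.dat", O_RDWR);
		struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };

		if (fd >= 0 && fcntl(fd, F_GETLK, &fl) == 0)
			printf("%s: %s\n", when, fl.l_type == F_UNLCK ?
			    "no lock held" : "locked by another process");
		_exit(0);
	}
	wait(NULL);
}

int
main(void)
{
	int fd1 = open("lockdemo.dat", O_RDWR | O_CREAT, 0644);
	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };	/* whole file */

	if (fd1 < 0) {
		perror("open");
		return (1);
	}
	if (fcntl(fd1, F_SETLK, &fl) == -1)
		perror("F_SETLK");
	check_from_child("after locking via fd1");

	/* Open and close a second, unrelated descriptor for the same file. */
	int fd2 = open("lockdemo.dat", O_RDWR);
	close(fd2);

	/* closef() released every POSIX lock this process held on the vnode. */
	check_from_child("after close(fd2)");
	return (0);
}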
1809 wakeup(fdtol); 12/30/03 12:13:19 sys/kern/kern_descrip.c 15 1810 } 1875 FILEDESC_UNLOCK(fdp); 1811 } 1876 return (EINVAL); 1812 FILEDESC_UNLOCK(fdp); 1877 } 1813 } 1878 if (hold) { 1814 } 1879 fhold(fp); 1815 return (fdrop(fp, td)); 1880 FILEDESC_UNLOCK(fdp); 1816 } 1881 } 1817 1882 *fpp = fp; 1818 /* 1883 return (0); 1819 * Drop reference on struct file passed in, may call closef if the 1884 } 1820 * reference hits zero. 1885 1821 */ 1886 int 1822 int 1887 fget(struct thread *td, int fd, struct file **fpp) 1823 fdrop(fp, td) 1888 { 1824 struct file *fp; 1889 1825 struct thread *td; 1890 return(_fget(td, fd, fpp, 0, 1)); 1826 { 1891 } 1827 1892 1828 FILE_LOCK(fp); 1893 int 1829 return (fdrop_locked(fp, td)); 1894 fget_read(struct thread *td, int fd, struct file **fpp) 1830 } 1895 { 1831 1896 1832 /* 1897 return(_fget(td, fd, fpp, FREAD, 1)); 1833 * Extract the file pointer associated with the specified descriptor for 1898 } 1834 * the current user process. 1899 1835 * 1900 int 1836 * If the descriptor doesn’t exist, EBADF is returned. 1901 fget_write(struct thread *td, int fd, struct file **fpp) 1837 * 1902 { 1838 * If the descriptor exists but doesn’t match ’flags’ then 1903 1839 * return EBADF for read attempts and EINVAL for write attempts. 1904 return(_fget(td, fd, fpp, FWRITE, 1)); 1840 * 1905 } 1841 * If ’hold’ is set (non-zero) the file’s refcount will be bumped on return. 1906 1842 * It should be droped with fdrop(). 1907 /* 1843 * If it is not set, then the refcount will not be bumped however the 1908 * Like fget() but loads the underlying vnode, or returns an error if 1844 * thread’s filedesc struct will be returned locked (for fgetsock). 1909 * the descriptor does not represent a vnode. Note that pipes use vnodes 1845 * 1910 * but never have VM objects (so VOP_GETVOBJECT() calls will return an 1846 * If an error occured the non-zero error is returned and *fpp is set to NULL. 1911 * error). The returned vnode will be vref()d. 1847 * Otherwise *fpp is set and zero is returned. 1912 */ 1848 */ 1913 static __inline int 1849 static __inline int 1914 _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) 1850 _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) 1915 { 1851 { 1916 struct file *fp; 1852 struct filedesc *fdp; 1917 int error; 1853 struct file *fp; 1918 1854 1919 *vpp = NULL; 1855 *fpp = NULL; 1920 if ((error = _fget(td, fd, &fp, 0, 0)) != 0) 1856 if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) 1921 return (error); 1857 return (EBADF); 1922 if (fp->f_vnode == NULL) { 1858 FILEDESC_LOCK(fdp); 1923 error = EINVAL; 1859 if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { 1924 } else { 1860 FILEDESC_UNLOCK(fdp); 1925 *vpp = fp->f_vnode; 1861 return (EBADF); 1926 vref(*vpp); 1862 } 1927 } 1863 1928 FILEDESC_UNLOCK(td->td_proc->p_fd); 1864 /* 1929 return (error); 1865 * Note: FREAD failures returns EBADF to maintain backwards 1930 } 1866 * compatibility with what routines returned before. 1931 1867 * 1932 int 1868 * Only one flag, or 0, may be specified. 
1933 fgetvp(struct thread *td, int fd, struct vnode **vpp) 1869 */ 1934 { 1870 if (flags == FREAD && (fp->f_flag & FREAD) == 0) { 1935 1871 FILEDESC_UNLOCK(fdp); 1936 return (_fgetvp(td, fd, vpp, 0)); 1872 return (EBADF); 1937 } 1873 } 1938 1874 if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { 1939 int 12/30/03 12:13:19 sys/kern/kern_descrip.c 16 1940 fgetvp_read(struct thread *td, int fd, struct vnode **vpp) 2005 struct vnode *vp; 1941 { 2006 int error; 1942 2007 1943 return (_fgetvp(td, fd, vpp, FREAD)); 2008 FILE_LOCK_ASSERT(fp, MA_OWNED); 1944 } 2009 1945 2010 if (--fp->f_count > 0) { 1946 int 2011 FILE_UNLOCK(fp); 1947 fgetvp_write(struct thread *td, int fd, struct vnode **vpp) 2012 return (0); 1948 { 2013 } 1949 2014 /* We have the last ref so we can proceed without the file lock. */ 1950 return (_fgetvp(td, fd, vpp, FWRITE)); 2015 FILE_UNLOCK(fp); 1951 } 2016 mtx_lock(&Giant); 1952 2017 if (fp->f_count < 0) 1953 /* 2018 panic("fdrop: count < 0"); 1954 * Like fget() but loads the underlying socket, or returns an error if 2019 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { 1955 * the descriptor does not represent a socket. 2020 lf.l_whence = SEEK_SET; 1956 * 2021 lf.l_start = 0; 1957 * We bump the ref count on the returned socket. XXX Also obtain the SX 2022 lf.l_len = 0; 1958 * lock in the future. 2023 lf.l_type = F_UNLCK; 1959 */ 2024 vp = fp->f_vnode; 1960 int 2025 (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 1961 fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) 2026 } 1962 { 2027 if (fp->f_ops != &badfileops) 1963 struct file *fp; 2028 error = fo_close(fp, td); 1964 int error; 2029 else 1965 2030 error = 0; 1966 *spp = NULL; 2031 ffree(fp); 1967 if (fflagp != NULL) 2032 mtx_unlock(&Giant); 1968 *fflagp = 0; 2033 return (error); 1969 if ((error = _fget(td, fd, &fp, 0, 0)) != 0) 2034 } 1970 return (error); 2035 1971 if (fp->f_type != DTYPE_SOCKET) { 2036 /* 1972 error = ENOTSOCK; 2037 * Apply an advisory lock on a file descriptor. 1973 } else { 2038 * 1974 *spp = fp->f_data; 2039 * Just attempt to get a record lock of the requested type on 1975 if (fflagp) 2040 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). 1976 *fflagp = fp->f_flag; 2041 */ 1977 soref(*spp); 2042 #ifndef _SYS_SYSPROTO_H_ 1978 } 2043 struct flock_args { 1979 FILEDESC_UNLOCK(td->td_proc->p_fd); 2044 int fd; 1980 return (error); 2045 int how; 1981 } 2046 }; 1982 2047 #endif 1983 /* 2048 /* 1984 * Drop the reference count on the the socket and XXX release the SX lock in 2049 * MPSAFE 1985 * the future. The last reference closes the socket. 2050 */ 1986 */ 2051 /* ARGSUSED */ 1987 void 2052 int 1988 fputsock(struct socket *so) 2053 flock(td, uap) 1989 { 2054 struct thread *td; 1990 2055 struct flock_args *uap; 1991 sorele(so); 2056 { 1992 } 2057 struct file *fp; 1993 2058 struct vnode *vp; 1994 /* 2059 struct flock lf; 1995 * Drop reference on struct file passed in, may call closef if the 2060 int error; 1996 * reference hits zero. 2061 1997 * Expects struct file locked, and will unlock it. 
2062 if ((error = fget(td, uap->fd, &fp)) != 0) 1998 */ 2063 return (error); 1999 int 2064 if (fp->f_type != DTYPE_VNODE) { 2000 fdrop_locked(fp, td) 2065 fdrop(fp, td); 2001 struct file *fp; 2066 return (EOPNOTSUPP); 2002 struct thread *td; 2067 } 2003 { 2068 2004 struct flock lf; 2069 mtx_lock(&Giant); 12/30/03 12:13:19 sys/kern/kern_descrip.c 17 2070 vp = fp->f_vnode; 2135 struct filedesc *fdp; 2071 lf.l_whence = SEEK_SET; 2136 int indx, dfd; 2072 lf.l_start = 0; 2137 int mode; 2073 lf.l_len = 0; 2138 int error; 2074 if (uap->how & LOCK_UN) { 2139 { 2075 lf.l_type = F_UNLCK; 2140 struct file *wfp; 2076 FILE_LOCK(fp); 2141 struct file *fp; 2077 fp->f_flag &= ˜FHASLOCK; 2142 2078 FILE_UNLOCK(fp); 2143 /* 2079 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); 2144 * If the to-be-dup’d fd number is greater than the allowed number 2080 goto done2; 2145 * of file descriptors, or the fd to be dup’d has already been 2081 } 2146 * closed, then reject. 2082 if (uap->how & LOCK_EX) 2147 */ 2083 lf.l_type = F_WRLCK; 2148 FILEDESC_LOCK(fdp); 2084 else if (uap->how & LOCK_SH) 2149 if (dfd < 0 || dfd >= fdp->fd_nfiles || 2085 lf.l_type = F_RDLCK; 2150 (wfp = fdp->fd_ofiles[dfd]) == NULL) { 2086 else { 2151 FILEDESC_UNLOCK(fdp); 2087 error = EBADF; 2152 return (EBADF); 2088 goto done2; 2153 } 2089 } 2154 2090 FILE_LOCK(fp); 2155 /* 2091 fp->f_flag |= FHASLOCK; 2156 * There are two cases of interest here. 2092 FILE_UNLOCK(fp); 2157 * 2093 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 2158 * For ENODEV simply dup (dfd) to file descriptor 2094 (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); 2159 * (indx) and return. 2095 done2: 2160 * 2096 fdrop(fp, td); 2161 * For ENXIO steal away the file structure from (dfd) and 2097 mtx_unlock(&Giant); 2162 * store it in (indx). (dfd) is effectively closed by 2098 return (error); 2163 * this operation. 2099 } 2164 * 2100 2165 * Any other error code is just returned. 2101 /* 2166 */ 2102 * File Descriptor pseudo-device driver (/dev/fd/). 2167 switch (error) { 2103 * 2168 case ENODEV: 2104 * Opening minor device N dup()s the file (if any) connected to file 2169 /* 2105 * descriptor N belonging to the calling process. Note that this driver 2170 * Check that the mode the file is being opened for is a 2106 * consists of only the ‘‘open()’’ routine, because all subsequent 2171 * subset of the mode of the existing descriptor. 2107 * references to this file will be direct to the other driver. 2172 */ 2108 */ 2173 FILE_LOCK(wfp); 2109 /* ARGSUSED */ 2174 if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { 2110 static int 2175 FILE_UNLOCK(wfp); 2111 fdopen(dev, mode, type, td) 2176 FILEDESC_UNLOCK(fdp); 2112 dev_t dev; 2177 return (EACCES); 2113 int mode, type; 2178 } 2114 struct thread *td; 2179 fp = fdp->fd_ofiles[indx]; 2115 { 2180 #if 0 2116 2181 if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) 2117 /* 2182 (void) munmapfd(td, indx); 2118 * XXX Kludge: set curthread->td_dupfd to contain the value of the 2183 #endif 2119 * the file descriptor being sought for duplication. The error 2184 fdp->fd_ofiles[indx] = wfp; 2120 * return ensures that the vnode for this device will be released 2185 fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; 2121 * by vn_open. Open will detect this special error and take the 2186 fhold_locked(wfp); 2122 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN 2187 FILE_UNLOCK(wfp); 2123 * will simply report the error. 
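flock() above takes whole-file locks that are attached to the open file (FHASLOCK) rather than to the process, so a descriptor shared through fork() already holds the lock, while an independent open() must wait or fail with LOCK_NB. A userland sketch; flockdemo.dat is illustrative.

#include <sys/file.h>
#include <sys/wait.h>

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("flockdemo.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return (1);
	}
	if (flock(fd, LOCK_EX) == -1)	/* whole-file lock on this open file */
		perror("flock");

	if (fork() == 0) {
		/*
		 * The inherited descriptor refers to the same struct file and
		 * so already holds the lock; a fresh open() does not.
		 */
		int other = open("flockdemo.dat", O_RDWR);

		if (flock(other, LOCK_EX | LOCK_NB) == -1)
			perror("child: flock(LOCK_NB)");	/* expect EWOULDBLOCK */
		_exit(0);
	}
	wait(NULL);
	return (0);
}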
2188 if (indx > fdp->fd_lastfile) 2124 */ 2189 fdp->fd_lastfile = indx; 2125 td->td_dupfd = dev2unit(dev); 2190 if (fp != NULL) 2126 return (ENODEV); 2191 FILE_LOCK(fp); 2127 } 2192 FILEDESC_UNLOCK(fdp); 2128 2193 /* 2129 /* 2194 * We now own the reference to fp that the ofiles[] array 2130 * Duplicate the specified descriptor to a free descriptor. 2195 * used to own. Release it. 2131 */ 2196 */ 2132 int 2197 if (fp != NULL) 2133 dupfdopen(td, fdp, indx, dfd, mode, error) 2198 fdrop_locked(fp, td); 2134 struct thread *td; 2199 return (0); 12/30/03 12:13:19 sys/kern/kern_descrip.c 18 2200 2264 if (old != NULL) { 2201 case ENXIO: 2265 FILEDESC_LOCK(fdp); 2202 /* 2266 fdtol->fdl_next = old->fdl_next; 2203 * Steal away the file pointer from dfd and stuff it into indx 2267 fdtol->fdl_prev = old; . 2268 old->fdl_next = fdtol; 2204 */ 2269 fdtol->fdl_next->fdl_prev = fdtol; 2205 fp = fdp->fd_ofiles[indx]; 2270 FILEDESC_UNLOCK(fdp); 2206 #if 0 2271 } else { 2207 if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) 2272 fdtol->fdl_next = fdtol; 2208 (void) munmapfd(td, indx); 2273 fdtol->fdl_prev = fdtol; 2209 #endif 2274 } 2210 fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; 2275 return fdtol; 2211 fdp->fd_ofiles[dfd] = NULL; 2276 } 2212 fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; 2277 2213 fdp->fd_ofileflags[dfd] = 0; 2278 /* 2214 2279 * Get file structures. 2215 /* 2280 */ 2216 * Complete the clean up of the filedesc structure by 2281 static int 2217 * recomputing the various hints. 2282 sysctl_kern_file(SYSCTL_HANDLER_ARGS) 2218 */ 2283 { 2219 if (indx > fdp->fd_lastfile) { 2284 struct xfile xf; 2220 fdp->fd_lastfile = indx; 2285 struct filedesc *fdp; 2221 } else { 2286 struct file *fp; 2222 while (fdp->fd_lastfile > 0 && 2287 struct proc *p; 2223 fdp->fd_ofiles[fdp->fd_lastfile] == NULL) { 2288 int error, n; 2224 fdp->fd_lastfile--; 2289 2225 } 2290 /* 2226 if (dfd < fdp->fd_freefile) 2291 * Note: because the number of file descriptors is calculated 2227 fdp->fd_freefile = dfd; 2292 * in different ways for sizing vs returning the data, 2228 } 2293 * there is information leakage from the first loop. However, 2229 if (fp != NULL) 2294 * it is of a similar order of magnitude to the leakage from 2230 FILE_LOCK(fp); 2295 * global system statistics such as kern.openfiles. 2231 FILEDESC_UNLOCK(fdp); 2296 */ 2232 2297 sysctl_wire_old_buffer(req, 0); 2233 /* 2298 if (req->oldptr == NULL) { 2234 * we now own the reference to fp that the ofiles[] array 2299 n = 16; /* A slight overestimate. */ 2235 * used to own. Release it. 2300 sx_slock(&filelist_lock); 2236 */ 2301 LIST_FOREACH(fp, &filehead, f_list) { 2237 if (fp != NULL) 2302 /* 2238 fdrop_locked(fp, td); 2303 * We should grab the lock, but this is an 2239 return (0); 2304 * estimate, so does it really matter? 
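Opening /dev/fd/N goes through fdopen(), which stashes N in td_dupfd and returns ENODEV so that dupfdopen() can turn the open into a dup of descriptor N. A sketch using /dev/fd/0, one of the three nodes created by fildesc_drvinit(); higher numbers would need fdescfs(5) mounted on /dev/fd, which is an assumption about the running system.

#include <sys/stat.h>

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct stat a, b;
	int fd;

	/* Takes the ENODEV path through dupfdopen() and dups descriptor 0. */
	if ((fd = open("/dev/fd/0", O_RDONLY)) < 0) {
		perror("/dev/fd/0");
		return (1);
	}
	fstat(STDIN_FILENO, &a);
	fstat(fd, &b);
	printf("new fd %d refers to the same file as stdin: %s\n", fd,
	    (a.st_dev == b.st_dev && a.st_ino == b.st_ino) ? "yes" : "no");
	return (0);
}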
2240 2305 */ 2241 default: 2306 /* mtx_lock(fp->f_mtxp); */ 2242 FILEDESC_UNLOCK(fdp); 2307 n += fp->f_count; 2243 return (error); 2308 /* mtx_unlock(f->f_mtxp); */ 2244 } 2309 } 2245 /* NOTREACHED */ 2310 sx_sunlock(&filelist_lock); 2246 } 2311 return (SYSCTL_OUT(req, 0, n * sizeof(xf))); 2247 2312 } 2248 2313 error = 0; 2249 struct filedesc_to_leader * 2314 bzero(&xf, sizeof(xf)); 2250 filedesc_to_leader_alloc(struct filedesc_to_leader *old, 2315 xf.xf_size = sizeof(xf); 2251 struct filedesc *fdp, 2316 sx_slock(&allproc_lock); 2252 struct proc *leader) 2317 LIST_FOREACH(p, &allproc, p_list) { 2253 { 2318 PROC_LOCK(p); 2254 struct filedesc_to_leader *fdtol; 2319 if (p_cansee(req->td, p) != 0) { 2255 2320 PROC_UNLOCK(p); 2256 MALLOC(fdtol, struct filedesc_to_leader *, 2321 continue; 2257 sizeof(struct filedesc_to_leader), 2322 } 2258 M_FILEDESC_TO_LEADER, 2323 xf.xf_pid = p->p_pid; 2259 M_WAITOK); 2324 xf.xf_uid = p->p_ucred->cr_uid; 2260 fdtol->fdl_refcount = 1; 2325 PROC_UNLOCK(p); 2261 fdtol->fdl_holdcount = 0; 2326 mtx_lock(&fdesc_mtx); 2262 fdtol->fdl_wakeup = 0; 2327 if ((fdp = p->p_fd) == NULL) { 2263 fdtol->fdl_leader = leader; 2328 mtx_unlock(&fdesc_mtx); 12/30/03 12:13:19 sys/kern/kern_descrip.c 19 2329 continue; 2394 .fo_stat = badfo_stat, 2330 } 2395 .fo_close = badfo_close, 2331 FILEDESC_LOCK(fdp); 2396 }; 2332 for (n = 0; n < fdp->fd_nfiles; ++n) { 2397 2333 if ((fp = fdp->fd_ofiles[n]) == NULL) 2398 static int 2334 continue; 2399 badfo_readwrite(fp, uio, active_cred, flags, td) 2335 xf.xf_fd = n; 2400 struct file *fp; 2336 xf.xf_file = fp; 2401 struct uio *uio; 2337 xf.xf_data = fp->f_data; 2402 struct ucred *active_cred; 2338 xf.xf_type = fp->f_type; 2403 struct thread *td; 2339 xf.xf_count = fp->f_count; 2404 int flags; 2340 xf.xf_msgcount = fp->f_msgcount; 2405 { 2341 xf.xf_offset = fp->f_offset; 2406 2342 xf.xf_flag = fp->f_flag; 2407 return (EBADF); 2343 error = SYSCTL_OUT(req, &xf, sizeof(xf)); 2408 } 2344 if (error) 2409 2345 break; 2410 static int 2346 } 2411 badfo_ioctl(fp, com, data, active_cred, td) 2347 FILEDESC_UNLOCK(fdp); 2412 struct file *fp; 2348 mtx_unlock(&fdesc_mtx); 2413 u_long com; 2349 if (error) 2414 void *data; 2350 break; 2415 struct ucred *active_cred; 2351 } 2416 struct thread *td; 2352 sx_sunlock(&allproc_lock); 2417 { 2353 return (error); 2418 2354 } 2419 return (EBADF); 2355 2420 } 2356 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 2421 2357 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); 2422 static int 2358 2423 badfo_poll(fp, events, active_cred, td) 2359 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, 2424 struct file *fp; 2360 &maxfilesperproc, 0, "Maximum files allowed open per process"); 2425 int events; 2361 2426 struct ucred *active_cred; 2362 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, 2427 struct thread *td; 2363 &maxfiles, 0, "Maximum number of files"); 2428 { 2364 2429 2365 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, 2430 return (0); 2366 &nfiles, 0, "System-wide number of open files"); 2431 } 2367 2432 2368 static void 2433 static int 2369 fildesc_drvinit(void *unused) 2434 badfo_kqfilter(fp, kn) 2370 { 2435 struct file *fp; 2371 dev_t dev; 2436 struct knote *kn; 2372 2437 { 2373 dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0"); 2438 2374 make_dev_alias(dev, "stdin"); 2439 return (0); 2375 dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1"); 2440 } 2376 make_dev_alias(dev, "stdout"); 2441 2377 dev = make_dev(&fildesc_cdevsw, 2, 
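The sysctls declared above export the open-file table: kern.file returns struct xfile records built by sysctl_kern_file(), while kern.openfiles simply reports the nfiles counter. A sketch of reading the simple counter from userland:

        #include <sys/types.h>
        #include <sys/sysctl.h>
        #include <err.h>
        #include <stdio.h>

        int
        main(void)
        {
                int n;
                size_t len = sizeof(n);

                if (sysctlbyname("kern.openfiles", &n, &len, NULL, 0) == -1)
                        err(1, "sysctlbyname");
                printf("system-wide open files: %d\n", n);
                return (0);
        }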
UID_ROOT, GID_WHEEL, 0666, "fd/2"); 2442 static int 2378 make_dev_alias(dev, "stderr"); 2443 badfo_stat(fp, sb, active_cred, td) 2379 } 2444 struct file *fp; 2380 2445 struct stat *sb; 2381 static fo_rdwr_t badfo_readwrite; 2446 struct ucred *active_cred; 2382 static fo_ioctl_t badfo_ioctl; 2447 struct thread *td; 2383 static fo_poll_t badfo_poll; 2448 { 2384 static fo_kqfilter_t badfo_kqfilter; 2449 2385 static fo_stat_t badfo_stat; 2450 return (EBADF); 2386 static fo_close_t badfo_close; 2451 } 2387 2452 2388 struct fileops badfileops = { 2453 static int 2389 .fo_read = badfo_readwrite, 2454 badfo_close(fp, td) 2390 .fo_write = badfo_readwrite, 2455 struct file *fp; 2391 .fo_ioctl = badfo_ioctl, 2456 struct thread *td; 2392 .fo_poll = badfo_poll, 2457 { 2393 .fo_kqfilter = badfo_kqfilter, 2458 12/30/03 12:13:19 sys/kern/kern_descrip.c 20 2459 return (EBADF); 2460 } 2461 2462 SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, 2463 fildesc_drvinit,NULL) 2464 2465 static void filelistinit(void *); 2466 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) 2467 2468 /* ARGSUSED*/ 2469 static void 2470 filelistinit(dummy) 2471 void *dummy; 2472 { 2473 2474 file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, 2475 NULL, NULL, UMA_ALIGN_PTR, 0); 2476 sx_init(&filelist_lock, "filelist lock"); 2477 mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); 2478 } 11/11/03 19:14:29 sys/kern/kern_exec.c 1 1 /* 65 #include 2 * Copyright (c) 1993, David Greenman 66 #include 3 * All rights reserved. 67 #include 4 * 68 #include 5 * Redistribution and use in source and binary forms, with or without 69 #include 6 * modification, are permitted provided that the following conditions 70 #include 7 * are met: 71 #include 8 * 1. Redistributions of source code must retain the above copyright 72 9 * notice, this list of conditions and the following disclaimer. 73 #include 10 * 2. Redistributions in binary form must reproduce the above copyright 74 11 * notice, this list of conditions and the following disclaimer in the 75 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); 12 * documentation and/or other materials provided with the distribution. 76 13 * 77 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ‘‘AS IS’’ AND 78 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 79 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 80 static int kern_execve(struct thread *td, char *fname, char **argv, 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 81 char **envv, struct mac *mac_p); 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 82 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 83 /* XXX This should be vm_size_t. */ 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 84 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 85 NULL, 0, sysctl_kern_ps_strings, "LU", ""); 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 86 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 87 /* XXX This should be vm_size_t. */ 24 * SUCH DAMAGE. 
88 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, 25 */ 89 NULL, 0, sysctl_kern_usrstack, "LU", ""); 26 90 27 #include 91 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, 28 __FBSDID("$FreeBSD: src/sys/kern/kern_exec.c,v 1.232 2003/11/12 03:14:29 rwats 92 NULL, 0, sysctl_kern_stackprot, "I", ""); on Exp $"); 93 29 94 u_long ps_arg_cache_limit = PAGE_SIZE / 16; 30 #include "opt_ktrace.h" 95 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 31 #include "opt_mac.h" 96 &ps_arg_cache_limit, 0, ""); 32 97 33 #include 98 int ps_argsopen = 1; 34 #include 99 SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, ""); 35 #include 100 36 #include 101 static int 37 #include 102 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) 38 #include 103 { 39 #include 104 struct proc *p; 40 #include 105 41 #include 106 p = curproc; 42 #include 107 return (SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, 43 #include 108 sizeof(p->p_sysent->sv_psstrings))); 44 #include 109 } 45 #include 110 46 #include 111 static int 47 #include 112 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) 48 #include 113 { 49 #include 114 struct proc *p; 50 #include 115 51 #include 116 p = curproc; 52 #include 117 return (SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, 53 #include 118 sizeof(p->p_sysent->sv_usrstack))); 54 #include 119 } 55 #include 120 56 #include 121 static int 57 #include 122 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) 58 #include 123 { 59 #ifdef KTRACE 124 struct proc *p; 60 #include 125 61 #endif 126 p = curproc; 62 127 return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, 63 #include 128 sizeof(p->p_sysent->sv_stackprot))); 64 #include 129 } 11/11/03 19:14:29 sys/kern/kern_exec.c 2 130 195 p->p_flag &= ˜P_SA; 131 /* 196 td->td_mailbox = NULL; 132 * Each of the items is a pointer to a ‘const struct execsw’, hence the 197 thread_single_end(); 133 * double pointer here. 198 } 134 */ 199 p->p_flag |= P_INEXEC; 135 static const struct execsw **execsw; 200 PROC_UNLOCK(p); 136 201 137 /* 202 /* 138 * In-kernel implementation of execve(). All arguments are assumed to be 203 * Initialize part of the common data 139 * userspace pointers from the passed thread. 
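sysctl_kern_usrstack() above copies out sv_usrstack for the calling process's ABI, published as kern.usrstack with format "LU" (an unsigned long). Reading it from userland:

        #include <sys/types.h>
        #include <sys/sysctl.h>
        #include <err.h>
        #include <stdio.h>

        int
        main(void)
        {
                unsigned long usrstack;
                size_t len = sizeof(usrstack);

                if (sysctlbyname("kern.usrstack", &usrstack, &len, NULL, 0) == -1)
                        err(1, "sysctlbyname");
                printf("top of user stack: %#lx\n", usrstack);
                return (0);
        }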
204 */ 140 * 205 imgp->proc = p; 141 * MPSAFE 206 imgp->userspace_argv = argv; 142 */ 207 imgp->userspace_envv = envv; 143 static int 208 imgp->execlabel = NULL; 144 kern_execve(td, fname, argv, envv, mac_p) 209 imgp->attr = &attr; 145 struct thread *td; 210 imgp->argc = imgp->envc = 0; 146 char *fname; 211 imgp->argv0 = NULL; 147 char **argv; 212 imgp->entry_addr = 0; 148 char **envv; 213 imgp->vmspace_destroyed = 0; 149 struct mac *mac_p; 214 imgp->interpreted = 0; 150 { 215 imgp->interpreter_name[0] = ’\0’; 151 struct proc *p = td->td_proc; 216 imgp->auxargs = NULL; 152 struct nameidata nd, *ndp; 217 imgp->vp = NULL; 153 struct ucred *newcred = NULL, *oldcred; 218 imgp->object = NULL; 154 struct uidinfo *euip; 219 imgp->firstpage = NULL; 155 register_t *stack_base; 220 imgp->ps_strings = 0; 156 int error, len, i; 221 imgp->auxarg_size = 0; 157 struct image_params image_params, *imgp; 222 158 struct vattr attr; 223 #ifdef MAC 159 int (*img_first)(struct image_params *); 224 error = mac_execve_enter(imgp, mac_p); 160 struct pargs *oldargs = NULL, *newargs = NULL; 225 if (error) { 161 struct sigacts *oldsigacts, *newsigacts; 226 mtx_lock(&Giant); 162 #ifdef KTRACE 227 goto exec_fail; 163 struct vnode *tracevp = NULL; 228 } 164 struct ucred *tracecred = NULL; 229 #endif 165 #endif 230 166 struct vnode *textvp = NULL; 231 /* 167 int credential_changing; 232 * Allocate temporary demand zeroed space for argument and 168 int textset; 233 * environment strings 169 #ifdef MAC 234 */ 170 struct label *interplabel = NULL; 235 imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + 171 int will_transition; 236 PAGE_SIZE); 172 #endif 237 if (imgp->stringbase == NULL) { 173 238 error = ENOMEM; 174 imgp = &image_params; 239 mtx_lock(&Giant); 175 240 goto exec_fail; 176 /* 241 } 177 * Lock the process and set the P_INEXEC flag to indicate that 242 imgp->stringp = imgp->stringbase; 178 * it should be left alone until we’re done here. This is 243 imgp->stringspace = ARG_MAX; 179 * necessary to avoid race conditions - e.g. in ptrace() - 244 imgp->image_header = imgp->stringbase + ARG_MAX; 180 * that might allow a local user to illicitly obtain elevated 245 181 * privileges. 246 /* 182 */ 247 * Translate the file name. namei() returns a vnode pointer 183 PROC_LOCK(p); 248 * in ni_vp amoung other things. 184 KASSERT((p->p_flag & P_INEXEC) == 0, 249 */ 185 ("%s(): process already has P_INEXEC flag", __func__)); 250 ndp = &nd; 186 if (p->p_flag & P_SA || p->p_numthreads > 1) { 251 NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, 187 if (thread_single(SINGLE_EXIT)) { 252 UIO_USERSPACE, fname, td); 188 PROC_UNLOCK(p); 253 189 return (ERESTART); /* Try again later. */ 254 mtx_lock(&Giant); 190 } 255 interpret: 191 /* 256 192 * If we get here all other threads are dead, 257 error = namei(ndp); 193 * so unset the associated flags and lose KSE mode. 258 if (error) { 194 */ 259 kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, 11/11/03 19:14:29 sys/kern/kern_exec.c 3 260 ARG_MAX + PAGE_SIZE); 325 */ 261 goto exec_fail; 326 if (imgp->interpreted) { 262 } 327 exec_unmap_first_page(imgp); 263 328 /* 264 imgp->vp = ndp->ni_vp; 329 * VV_TEXT needs to be unset for scripts. There is a short 265 imgp->fname = fname; 330 * period before we determine that something is a script where 266 331 * VV_TEXT will be set. The vnode lock is held over this 267 /* 332 * entire period so nothing should illegitimately be blocked. 
268 * Check file permissions (also ’opens’ file) 333 */ 269 */ 334 imgp->vp->v_vflag &= ˜VV_TEXT; 270 error = exec_check_permissions(imgp); 335 /* free name buffer and old vnode */ 271 if (error) 336 NDFREE(ndp, NDF_ONLY_PNBUF); 272 goto exec_fail_dealloc; 337 #ifdef MAC 273 338 interplabel = mac_vnode_label_alloc(); 274 if (VOP_GETVOBJECT(imgp->vp, &imgp->object) == 0) 339 mac_copy_vnode_label(ndp->ni_vp->v_label, interplabel); 275 vm_object_reference(imgp->object); 340 #endif 276 341 vput(ndp->ni_vp); 277 /* 342 vm_object_deallocate(imgp->object); 278 * Set VV_TEXT now so no one can write to the executable while we’re 343 imgp->object = NULL; 279 * activating it. 344 /* set new name to that of the interpreter */ 280 * 345 NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, 281 * Remember if this was set before and unset it in case this is not 346 UIO_SYSSPACE, imgp->interpreter_name, td); 282 * actually an executable image. 347 goto interpret; 283 */ 348 } 284 textset = imgp->vp->v_vflag & VV_TEXT; 349 285 imgp->vp->v_vflag |= VV_TEXT; 350 /* 286 351 * Copy out strings (args and env) and initialize stack base 287 error = exec_map_first_page(imgp); 352 */ 288 if (error) 353 if (p->p_sysent->sv_copyout_strings) 289 goto exec_fail_dealloc; 354 stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); 290 355 else 291 /* 356 stack_base = exec_copyout_strings(imgp); 292 * If the current process has a special image activator it 357 293 * wants to try first, call it. For example, emulating shell 358 /* 294 * scripts differently. 359 * If custom stack fixup routine present for this process 295 */ 360 * let it do the stack setup. 296 error = -1; 361 * Else stuff argument count as first item on stack 297 if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) 362 */ 298 error = img_first(imgp); 363 if (p->p_sysent->sv_fixup) 299 364 (*p->p_sysent->sv_fixup)(&stack_base, imgp); 300 /* 365 else 301 * Loop through the list of image activators, calling each one. 366 suword(--stack_base, imgp->argc); 302 * An activator returns -1 if there is no match, 0 on success, 367 303 * and an error otherwise. 368 /* 304 */ 369 * For security and other reasons, the file descriptor table cannot 305 for (i = 0; error == -1 && execsw[i]; ++i) { 370 * be shared after an exec. 306 if (execsw[i]->ex_imgact == NULL || 371 */ 307 execsw[i]->ex_imgact == img_first) { 372 FILEDESC_LOCK(p->p_fd); 308 continue; 373 if (p->p_fd->fd_refcnt > 1) { 309 } 374 struct filedesc *tmp; 310 error = (*execsw[i]->ex_imgact)(imgp); 375 311 } 376 tmp = fdcopy(td->td_proc->p_fd); 312 377 FILEDESC_UNLOCK(p->p_fd); 313 if (error) { 378 fdfree(td); 314 if (error == -1) { 379 p->p_fd = tmp; 315 if (textset == 0) 380 } else 316 imgp->vp->v_vflag &= ˜VV_TEXT; 381 FILEDESC_UNLOCK(p->p_fd); 317 error = ENOEXEC; 382 318 } 383 /* 319 goto exec_fail_dealloc; 384 * Malloc things before we need locks. 320 } 385 */ 321 386 newcred = crget(); 322 /* 387 euip = uifind(attr.va_uid); 323 * Special interpreter operation, cleanup and loop up to try to 388 i = imgp->endargs - imgp->stringbase; 324 * activate the interpreter. 
389 if (ps_arg_cache_limit >= i + sizeof(struct pargs)) 11/11/03 19:14:29 sys/kern/kern_exec.c 4 390 newargs = pargs_alloc(i); 455 #endif 391 456 392 /* close files on exec */ 457 if (credential_changing && 393 fdcloseexec(td); 458 (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && 394 459 (p->p_flag & P_TRACED) == 0) { 395 /* Get a reference to the vnode prior to locking the proc */ 460 /* 396 VREF(ndp->ni_vp); 461 * Turn off syscall tracing for set-id programs, except for 397 462 * root. Record any set-id flags first to make sure that 398 /* 463 * we do not regain any tracing during a possible block. 399 * For security and other reasons, signal handlers cannot 464 */ 400 * be shared after an exec. The new process gets a copy of the old 465 setsugid(p); 401 * handlers. In execsigs(), the new process will have its signals 466 #ifdef KTRACE 402 * reset. 467 if (p->p_tracevp != NULL && suser_cred(oldcred, PRISON_ROOT)) 403 */ { 404 PROC_LOCK(p); 468 mtx_lock(&ktrace_mtx); 405 if (sigacts_shared(p->p_sigacts)) { 469 p->p_traceflag = 0; 406 oldsigacts = p->p_sigacts; 470 tracevp = p->p_tracevp; 407 PROC_UNLOCK(p); 471 p->p_tracevp = NULL; 408 newsigacts = sigacts_alloc(); 472 tracecred = p->p_tracecred; 409 sigacts_copy(newsigacts, oldsigacts); 473 p->p_tracecred = NULL; 410 PROC_LOCK(p); 474 mtx_unlock(&ktrace_mtx); 411 p->p_sigacts = newsigacts; 475 } 412 } else 476 #endif 413 oldsigacts = NULL; 477 /* 414 478 * Close any file descriptors 0..2 that reference procfs, 415 /* Stop profiling */ 479 * then make sure file descriptors 0..2 are in use. 416 stopprofclock(p); 480 * 417 481 * setugidsafety() may call closef() and then pfind() 418 /* reset caught signals */ 482 * which may grab the process lock. 419 execsigs(p); 483 * fdcheckstd() may call falloc() which may block to 420 484 * allocate memory, so temporarily drop the process lock. 421 /* name this process - nameiexec(p, ndp) */ 485 */ 422 len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); 486 PROC_UNLOCK(p); 423 bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); 487 setugidsafety(td); 424 p->p_comm[len] = 0; 488 error = fdcheckstd(td); 425 489 if (error != 0) 426 /* 490 goto done1; 427 * mark as execed, wakeup the process that vforked (if any) and tell 491 PROC_LOCK(p); 428 * it that it now has its own resources back 492 /* 429 */ 493 * Set the new credentials. 430 p->p_flag |= P_EXEC; 494 */ 431 if (p->p_pptr && (p->p_flag & P_PPWAIT)) { 495 crcopy(newcred, oldcred); 432 p->p_flag &= ˜P_PPWAIT; 496 if (attr.va_mode & VSUID) 433 wakeup(p->p_pptr); 497 change_euid(newcred, euip); 434 } 498 if (attr.va_mode & VSGID) 435 499 change_egid(newcred, attr.va_gid); 436 /* 500 #ifdef MAC 437 * Implement image setuid/setgid. 501 if (will_transition) { 438 * 502 mac_execve_transition(oldcred, newcred, imgp->vp, 439 * Don’t honor setuid/setgid if the filesystem prohibits it or if 503 interplabel, imgp); 440 * the process is being traced. 504 } 441 * 505 #endif 442 * XXXMAC: For the time being, use NOSUID to also prohibit 506 /* 443 * transitions on the file system. 507 * Implement correct POSIX saved-id behavior. 444 */ 508 * 445 oldcred = p->p_ucred; 509 * XXXMAC: Note that the current logic will save the 446 credential_changing = 0; 510 * uid and gid if a MAC domain transition occurs, even 447 credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid != 511 * though maybe it shouldn’t. 
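setsugid() above marks the process P_SUGID when the exec changes credentials; userland can query that state with issetugid(2) before trusting attacker-controllable input such as the environment. A sketch (MYAPP_CONF is a hypothetical variable name used only for illustration):

        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>

        int
        main(void)
        {
                const char *conf = NULL;

                /* P_SUGID set by a set-id or credential-changing exec. */
                if (!issetugid())
                        conf = getenv("MYAPP_CONF");
                printf("config: %s\n", conf != NULL ? conf : "(builtin default)");
                return (0);
        }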
448 attr.va_uid; 512 */ 449 credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid != 513 change_svuid(newcred, newcred->cr_uid); 450 attr.va_gid; 514 change_svgid(newcred, newcred->cr_gid); 451 #ifdef MAC 515 p->p_ucred = newcred; 452 will_transition = mac_execve_will_transition(oldcred, imgp->vp, 516 newcred = NULL; 453 interplabel, imgp); 517 } else { 454 credential_changing |= will_transition; 518 if (oldcred->cr_uid == oldcred->cr_ruid && 11/11/03 19:14:29 sys/kern/kern_exec.c 5 519 oldcred->cr_gid == oldcred->cr_rgid) 584 exec_setregs(td, imgp->entry_addr, 520 p->p_flag &= ˜P_SUGID; 585 (u_long)(uintptr_t)stack_base, imgp->ps_strings); 521 /* 586 522 * Implement correct POSIX saved-id behavior. 587 done1: 523 * 588 /* 524 * XXX: It’s not clear that the existing behavior is 589 * Free any resources malloc’d earlier that we didn’t use. 525 * POSIX-compliant. A number of sources indicate that the 590 */ 526 * saved uid/gid should only be updated if the new ruid is 591 uifree(euip); 527 * not equal to the old ruid, or the new euid is not equal 592 if (newcred == NULL) 528 * to the old euid and the new euid is not equal to the old 593 crfree(oldcred); 529 * ruid. The FreeBSD code always updates the saved uid/gid. 594 else 530 * Also, this code uses the new (replaced) euid and egid as 595 crfree(newcred); 531 * the source, which may or may not be the right ones to use. 596 /* 532 */ 597 * Handle deferred decrement of ref counts. 533 if (oldcred->cr_svuid != oldcred->cr_uid || 598 */ 534 oldcred->cr_svgid != oldcred->cr_gid) { 599 if (textvp != NULL) 535 crcopy(newcred, oldcred); 600 vrele(textvp); 536 change_svuid(newcred, newcred->cr_uid); 601 if (ndp->ni_vp && error != 0) 537 change_svgid(newcred, newcred->cr_gid); 602 vrele(ndp->ni_vp); 538 p->p_ucred = newcred; 603 #ifdef KTRACE 539 newcred = NULL; 604 if (tracevp != NULL) 540 } 605 vrele(tracevp); 541 } 606 if (tracecred != NULL) 542 607 crfree(tracecred); 543 /* 608 #endif 544 * Store the vp for use in procfs. This vnode was referenced prior 609 if (oldargs != NULL) 545 * to locking the proc lock. 610 pargs_drop(oldargs); 546 */ 611 if (newargs != NULL) 547 textvp = p->p_textvp; 612 pargs_drop(newargs); 548 p->p_textvp = ndp->ni_vp; 613 if (oldsigacts != NULL) 549 614 sigacts_free(oldsigacts); 550 /* 615 551 * Notify others that we exec’d, and clear the P_INEXEC flag 616 exec_fail_dealloc: 552 * as we’re now a bona fide freshly-execed process. 617 553 */ 618 /* 554 KNOTE(&p->p_klist, NOTE_EXEC); 619 * free various allocated resources 555 p->p_flag &= ˜P_INEXEC; 620 */ 556 621 if (imgp->firstpage) 557 /* 622 exec_unmap_first_page(imgp); 558 * If tracing the process, trap to debugger so breakpoints 623 559 * can be set before the program executes. 
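KNOTE(&p->p_klist, NOTE_EXEC) above is what delivers EVFILT_PROC events to kqueue consumers when a process successfully execs. A small monitor sketch built on the standard kqueue(2)/kevent(2) interface, watching a pid given on the command line:

        #include <sys/types.h>
        #include <sys/event.h>
        #include <sys/time.h>
        #include <err.h>
        #include <stdio.h>
        #include <stdlib.h>

        int
        main(int argc, char **argv)
        {
                struct kevent kev;
                pid_t pid;
                int kq;

                if (argc != 2)
                        errx(1, "usage: watchexec pid");
                pid = (pid_t)atoi(argv[1]);
                if ((kq = kqueue()) == -1)
                        err(1, "kqueue");
                EV_SET(&kev, pid, EVFILT_PROC, EV_ADD, NOTE_EXEC | NOTE_EXIT, 0, NULL);
                if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                        err(1, "kevent register");
                if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
                        err(1, "kevent wait");
                printf("pid %d %s\n", (int)pid,
                    (kev.fflags & NOTE_EXEC) ? "called execve()" : "exited");
                return (0);
        }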
624 if (imgp->vp) { 560 */ 625 NDFREE(ndp, NDF_ONLY_PNBUF); 561 if (p->p_flag & P_TRACED) 626 vput(imgp->vp); 562 psignal(p, SIGTRAP); 627 } 563 628 564 /* clear "fork but no exec" flag, as we _are_ execing */ 629 if (imgp->stringbase != NULL) 565 p->p_acflag &= ˜AFORK; 630 kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, 566 631 ARG_MAX + PAGE_SIZE); 567 /* Free any previous argument cache */ 632 568 oldargs = p->p_args; 633 if (imgp->object) 569 p->p_args = NULL; 634 vm_object_deallocate(imgp->object); 570 635 571 /* Cache arguments if they fit inside our allowance */ 636 if (error == 0) { 572 if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { 637 /* 573 bcopy(imgp->stringbase, newargs->ar_args, i); 638 * Stop the process here if its stop event mask has 574 p->p_args = newargs; 639 * the S_EXEC bit set. 575 newargs = NULL; 640 */ 576 } 641 STOPEVENT(p, S_EXEC, 0); 577 PROC_UNLOCK(p); 642 goto done2; 578 643 } 579 /* Set values passed into the program in registers. */ 644 580 if (p->p_sysent->sv_setregs) 645 exec_fail: 581 (*p->p_sysent->sv_setregs)(td, imgp->entry_addr, 646 /* we’re done here, clear P_INEXEC */ 582 (u_long)(uintptr_t)stack_base, imgp->ps_strings); 647 PROC_LOCK(p); 583 else 648 p->p_flag &= ˜P_INEXEC; 11/11/03 19:14:29 sys/kern/kern_exec.c 6 649 PROC_UNLOCK(p); 714 char **envv; 650 715 struct mac *mac_p; 651 if (imgp->vmspace_destroyed) { 716 } */ *uap; 652 /* sorry, no more process anymore. exit gracefully */ 717 { 653 #ifdef MAC 718 654 mac_execve_exit(imgp); 719 #ifdef MAC 655 if (interplabel != NULL) 720 return (kern_execve(td, uap->fname, uap->argv, uap->envv, 656 mac_vnode_label_free(interplabel); 721 uap->mac_p)); 657 #endif 722 #else 658 exit1(td, W_EXITCODE(0, SIGABRT)); 723 return (ENOSYS); 659 /* NOT REACHED */ 724 #endif 660 error = 0; 725 } 661 } 726 662 done2: 727 int 663 #ifdef MAC 728 exec_map_first_page(imgp) 664 mac_execve_exit(imgp); 729 struct image_params *imgp; 665 if (interplabel != NULL) 730 { 666 mac_vnode_label_free(interplabel); 731 int rv, i; 667 #endif 732 int initial_pagein; 668 mtx_unlock(&Giant); 733 vm_page_t ma[VM_INITIAL_PAGEIN]; 669 return (error); 734 vm_object_t object; 670 } 735 671 736 GIANT_REQUIRED; 672 #ifndef _SYS_SYSPROTO_H_ 737 673 struct execve_args { 738 if (imgp->firstpage) { 674 char *fname; 739 exec_unmap_first_page(imgp); 675 char **argv; 740 } 676 char **envv; 741 677 }; 742 VOP_GETVOBJECT(imgp->vp, &object); 678 #endif 743 VM_OBJECT_LOCK(object); 679 744 ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); 680 /* 745 if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { 681 * MPSAFE 746 initial_pagein = VM_INITIAL_PAGEIN; 682 */ 747 if (initial_pagein > object->size) 683 int 748 initial_pagein = object->size; 684 execve(td, uap) 749 for (i = 1; i < initial_pagein; i++) { 685 struct thread *td; 750 if ((ma[i] = vm_page_lookup(object, i)) != NULL) { 686 struct execve_args /* { 751 if (ma[i]->valid) 687 char *fname; 752 break; 688 char **argv; 753 vm_page_lock_queues(); 689 char **envv; 754 if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) { 690 } */ *uap; 755 vm_page_unlock_queues(); 691 { 756 break; 692 757 } 693 return (kern_execve(td, uap->fname, uap->argv, uap->envv, NULL)); 758 vm_page_busy(ma[i]); 694 } 759 vm_page_unlock_queues(); 695 760 } else { 696 #ifndef _SYS_SYSPROTO_H_ 761 ma[i] = vm_page_alloc(object, i, 697 struct __mac_execve_args { 762 VM_ALLOC_NORMAL); 698 char *fname; 763 if (ma[i] == NULL) 699 char **argv; 764 break; 700 char **envv; 765 } 701 struct mac *mac_p; 766 } 702 
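The execve() system call wrapper above simply forwards to kern_execve() with a NULL MAC label. For reference, the matching userland call (the program path and argument strings are illustrative):

        #include <err.h>
        #include <unistd.h>

        int
        main(void)
        {
                char *argv[] = { "ls", "-l", "/", NULL };
                char *envv[] = { "PATH=/bin:/usr/bin", NULL };

                /* Enters the kernel through execve() -> kern_execve(). */
                execve("/bin/ls", argv, envv);
                err(1, "execve");       /* only reached on failure */
        }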
}; 767 initial_pagein = i; 703 #endif 768 rv = vm_pager_get_pages(object, ma, initial_pagein, 0); 704 769 ma[0] = vm_page_lookup(object, 0); 705 /* 770 if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || 706 * MPSAFE 771 (ma[0]->valid == 0)) { 707 */ 772 if (ma[0]) { 708 int 773 vm_page_lock_queues(); 709 __mac_execve(td, uap) 774 pmap_remove_all(ma[0]); 710 struct thread *td; 775 vm_page_free(ma[0]); 711 struct __mac_execve_args /* { 776 vm_page_unlock_queues(); 712 char *fname; 777 } 713 char **argv; 778 VM_OBJECT_UNLOCK(object); 11/11/03 19:14:29 sys/kern/kern_exec.c 7 779 return (EIO); 844 */ 780 } 845 map = &vmspace->vm_map; 781 } 846 if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser && 782 vm_page_lock_queues(); 847 vm_map_max(map) == sv->sv_maxuser) { 783 vm_page_wire(ma[0]); 848 shmexit(vmspace); 784 vm_page_wakeup(ma[0]); 849 vm_page_lock_queues(); 785 vm_page_unlock_queues(); 850 pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map), 786 VM_OBJECT_UNLOCK(object); 851 vm_map_max(map)); 787 852 vm_page_unlock_queues(); 788 pmap_qenter((vm_offset_t)imgp->image_header, ma, 1); 853 vm_map_remove(map, vm_map_min(map), vm_map_max(map)); 789 imgp->firstpage = ma[0]; 854 } else { 790 855 vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); 791 return (0); 856 vmspace = p->p_vmspace; 792 } 857 map = &vmspace->vm_map; 793 858 } 794 void 859 795 exec_unmap_first_page(imgp) 860 /* Allocate a new stack */ 796 struct image_params *imgp; 861 stack_addr = sv->sv_usrstack - maxssiz; 797 { 862 error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, 798 GIANT_REQUIRED; 863 sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); 799 864 if (error) 800 if (imgp->firstpage) { 865 return (error); 801 pmap_qremove((vm_offset_t)imgp->image_header, 1); 866 802 vm_page_lock_queues(); 867 #ifdef __ia64__ 803 vm_page_unwire(imgp->firstpage, 1); 868 /* Allocate a new register stack */ 804 vm_page_unlock_queues(); 869 stack_addr = IA64_BACKINGSTORE; 805 imgp->firstpage = NULL; 870 error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, 806 } 871 sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP); 807 } 872 if (error) 808 873 return (error); 809 /* 874 #endif 810 * Destroy old address space, and allocate a new stack 875 811 * The new stack is only SGROWSIZ large because it is grown 876 /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the 812 * automatically in trap.c. 877 * VM_STACK case, but they are still used to monitor the size of the 813 */ 878 * process stack so we can check the stack rlimit. 814 int 879 */ 815 exec_new_vmspace(imgp, sv) 880 vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; 816 struct image_params *imgp; 881 vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz; 817 struct sysentvec *sv; 882 818 { 883 return (0); 819 int error; 884 } 820 struct proc *p = imgp->proc; 885 821 struct vmspace *vmspace = p->p_vmspace; 886 /* 822 vm_offset_t stack_addr; 887 * Copy out argument and environment strings from the old process 823 vm_map_t map; 888 * address space into the temporary string buffer. 824 889 */ 825 GIANT_REQUIRED; 890 int 826 891 exec_extract_strings(imgp) 827 imgp->vmspace_destroyed = 1; 892 struct image_params *imgp; 828 893 { 829 EVENTHANDLER_INVOKE(process_exec, p); 894 char **argv, **envv; 830 895 char *argp, *envp; 831 /* 896 int error; 832 * Here is as good a place as any to do any resource limit cleanups. 
897 size_t length; 833 * This is needed if a 64 bit binary exec’s a 32 bit binary - the 898 834 * data size limit may need to be changed to a value that makes 899 /* 835 * sense for the 32 bit binary. 900 * extract arguments first 836 */ 901 */ 837 if (sv->sv_fixlimits) 902 838 sv->sv_fixlimits(imgp); 903 argv = imgp->userspace_argv; 839 904 840 /* 905 if (argv) { 841 * Blow away entire process VM, if address space not shared, 906 argp = (caddr_t)(intptr_t)fuword(argv); 842 * otherwise, create a new VM space so that other threads are 907 if (argp == (caddr_t)-1) 843 * not disrupted 908 return (EFAULT); 11/11/03 19:14:29 sys/kern/kern_exec.c 8 909 if (argp) 974 /* 910 argv++; 975 * Calculate string base and vector table pointers. 911 if (imgp->argv0) 976 * Also deal with signal trampoline code for this exec type. 912 argp = imgp->argv0; 977 */ 913 if (argp) { 978 p = imgp->proc; 914 do { 979 szsigcode = 0; 915 if (argp == (caddr_t)-1) 980 arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; 916 return (EFAULT); 981 if (p->p_sysent->sv_szsigcode != NULL) 917 if ((error = copyinstr(argp, imgp->stringp, 982 szsigcode = *(p->p_sysent->sv_szsigcode); 918 imgp->stringspace, &length))) { 983 destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - 919 if (error == ENAMETOOLONG) 984 roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); 920 return (E2BIG); 985 921 return (error); 986 /* 922 } 987 * install sigcode 923 imgp->stringspace -= length; 988 */ 924 imgp->stringp += length; 989 if (szsigcode) 925 imgp->argc++; 990 copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - 926 } while ((argp = (caddr_t)(intptr_t)fuword(argv++))); 991 szsigcode), szsigcode); 927 } 992 928 } 993 /* 929 994 * If we have a valid auxargs ptr, prepare some room 930 imgp->endargs = imgp->stringp; 995 * on the stack. 931 996 */ 932 /* 997 if (imgp->auxargs) { 933 * extract environment strings 998 /* 934 */ 999 * ’AT_COUNT*2’ is size for the ELF Auxargs data. This is for 935 1000 * lower compatibility. 936 envv = imgp->userspace_envv; 1001 */ 937 1002 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : 938 if (envv) { 1003 (AT_COUNT * 2); 939 while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { 1004 /* 940 if (envp == (caddr_t)-1) 1005 * The ’+ 2’ is for the null pointers at the end of each of 941 return (EFAULT); 1006 * the arg and env vector sets,and imgp->auxarg_size is room 942 if ((error = copyinstr(envp, imgp->stringp, 1007 * for argument of Runtime loader. 943 imgp->stringspace, &length))) { 1008 */ 944 if (error == ENAMETOOLONG) 1009 vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + 945 return (E2BIG); 1010 imgp->auxarg_size) * sizeof(char *)); 946 return (error); 1011 947 } 1012 } else 948 imgp->stringspace -= length; 1013 /* 949 imgp->stringp += length; 1014 * The ’+ 2’ is for the null pointers at the end of each of 950 imgp->envc++; 1015 * the arg and env vector sets 951 } 1016 */ 952 } 1017 vectp = (char **)(destp - (imgp->argc + imgp->envc + 2) * 953 1018 sizeof(char *)); 954 return (0); 1019 955 } 1020 /* 956 1021 * vectp also becomes our initial stack base 957 /* 1022 */ 958 * Copy strings out to the new process address space, constructing 1023 stack_base = (register_t *)vectp; 959 * new arg and env vector tables. Return a pointer to the base 1024 960 * so that it can be used as the initial stack pointer. 
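exec_extract_strings() above copies the argument and environment strings into a buffer of ARG_MAX bytes; when copyinstr() overflows that space (ENAMETOOLONG) the error is reported to userland as E2BIG. The same limit is visible from userland via sysconf():

        #include <limits.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(void)
        {
                /* Exceeding this across argv + envp makes execve() fail with E2BIG. */
                printf("ARG_MAX: %ld bytes\n", sysconf(_SC_ARG_MAX));
                return (0);
        }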
1025 stringp = imgp->stringbase; 961 */ 1026 argc = imgp->argc; 962 register_t * 1027 envc = imgp->envc; 963 exec_copyout_strings(imgp) 1028 964 struct image_params *imgp; 1029 /* 965 { 1030 * Copy out strings - arguments and environment. 966 int argc, envc; 1031 */ 967 char **vectp; 1032 copyout(stringp, destp, ARG_MAX - imgp->stringspace); 968 char *stringp, *destp; 1033 969 register_t *stack_base; 1034 /* 970 struct ps_strings *arginfo; 1035 * Fill in "ps_strings" struct for ps, w, etc. 971 struct proc *p; 1036 */ 972 int szsigcode; 1037 suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); 973 1038 suword(&arginfo->ps_nargvstr, argc); 11/11/03 19:14:29 sys/kern/kern_exec.c 9 1039 1104 * file really is executable. 1040 /* 1105 * 3) Insure that the file is a regular file. 1041 * Fill in argument portion of vector table. 1106 */ 1042 */ 1107 if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || 1043 for (; argc > 0; --argc) { 1108 ((attr->va_mode & 0111) == 0) || 1044 suword(vectp++, (long)(intptr_t)destp); 1109 (attr->va_type != VREG)) 1045 while (*stringp++ != 0) 1110 return (EACCES); 1046 destp++; 1111 1047 destp++; 1112 /* 1048 } 1113 * Zero length files can’t be exec’d 1049 1114 */ 1050 /* a null vector table pointer separates the argp’s from the envp’s */ 1115 if (attr->va_size == 0) 1051 suword(vectp++, 0); 1116 return (ENOEXEC); 1052 1117 1053 suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); 1118 /* 1054 suword(&arginfo->ps_nenvstr, envc); 1119 * Check for execute permission to file based on current credentials. 1055 1120 */ 1056 /* 1121 error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); 1057 * Fill in environment portion of vector table. 1122 if (error) 1058 */ 1123 return (error); 1059 for (; envc > 0; --envc) { 1124 1060 suword(vectp++, (long)(intptr_t)destp); 1125 /* 1061 while (*stringp++ != 0) 1126 * Check number of open-for-writes on the file and deny execution 1062 destp++; 1127 * if there are any. 1063 destp++; 1128 */ 1064 } 1129 if (vp->v_writecount) 1065 1130 return (ETXTBSY); 1066 /* end of vector table is a null pointer */ 1131 1067 suword(vectp, 0); 1132 /* 1068 1133 * Call filesystem specific open routine (which does nothing in the 1069 return (stack_base); 1134 * general case). 1070 } 1135 */ 1071 1136 error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1); 1072 /* 1137 return (error); 1073 * Check permissions of file to execute. 1138 } 1074 * Called with imgp->vp locked. 1139 1075 * Return 0 for success or error code on failure. 
1140 /* 1076 */ 1141 * Exec handler registration 1077 int 1142 */ 1078 exec_check_permissions(imgp) 1143 int 1079 struct image_params *imgp; 1144 exec_register(execsw_arg) 1080 { 1145 const struct execsw *execsw_arg; 1081 struct vnode *vp = imgp->vp; 1146 { 1082 struct vattr *attr = imgp->attr; 1147 const struct execsw **es, **xs, **newexecsw; 1083 struct thread *td; 1148 int count = 2; /* New slot and trailing NULL */ 1084 int error; 1149 1085 1150 if (execsw) 1086 td = curthread; /* XXXKSE */ 1151 for (es = execsw; *es; es++) 1087 1152 count++; 1088 /* Get file attributes */ 1153 newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); 1089 error = VOP_GETATTR(vp, attr, td->td_ucred, td); 1154 if (newexecsw == NULL) 1090 if (error) 1155 return (ENOMEM); 1091 return (error); 1156 xs = newexecsw; 1092 1157 if (execsw) 1093 #ifdef MAC 1158 for (es = execsw; *es; es++) 1094 error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp); 1159 *xs++ = *es; 1095 if (error) 1160 *xs++ = execsw_arg; 1096 return (error); 1161 *xs = NULL; 1097 #endif 1162 if (execsw) 1098 1163 free(execsw, M_TEMP); 1099 /* 1164 execsw = newexecsw; 1100 * 1) Check if file execution is disabled for the filesystem that this 1165 return (0); 1101 * file resides on. 1166 } 1102 * 2) Insure that at least one execute bit is on - otherwise root 1167 1103 * will always succeed, and we don’t want to happen unless the 1168 int 11/11/03 19:14:29 sys/kern/kern_exec.c 10 1169 exec_unregister(execsw_arg) 1170 const struct execsw *execsw_arg; 1171 { 1172 const struct execsw **es, **xs, **newexecsw; 1173 int count = 1; 1174 1175 if (execsw == NULL) 1176 panic("unregister with no handlers left?\n"); 1177 1178 for (es = execsw; *es; es++) { 1179 if (*es == execsw_arg) 1180 break; 1181 } 1182 if (*es == NULL) 1183 return (ENOENT); 1184 for (es = execsw; *es; es++) 1185 if (*es != execsw_arg) 1186 count++; 1187 newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); 1188 if (newexecsw == NULL) 1189 return (ENOMEM); 1190 xs = newexecsw; 1191 for (es = execsw; *es; es++) 1192 if (*es != execsw_arg) 1193 *xs++ = *es; 1194 *xs = NULL; 1195 if (execsw) 1196 free(execsw, M_TEMP); 1197 execsw = newexecsw; 1198 return (0); 1199 } 11/14/03 10:49:01 sys/kern/kern_exit.c 1 1 /* 65 #include 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 66 #include 3 * The Regents of the University of California. All rights reserved. 67 #include /* for acct_process() function prototype */ 4 * (c) UNIX System Laboratories, Inc. 68 #include 5 * All or some portions of this file are derived from material licensed 69 #include 6 * to the University of California by American Telephone and Telegraph 70 #include 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 71 #include 8 * the permission of UNIX System Laboratories, Inc. 72 #include 9 * 73 #ifdef KTRACE 10 * Redistribution and use in source and binary forms, with or without 74 #include 11 * modification, are permitted provided that the following conditions 75 #endif 12 * are met: 76 13 * 1. Redistributions of source code must retain the above copyright 77 #include 14 * notice, this list of conditions and the following disclaimer. 78 #include 15 * 2. Redistributions in binary form must reproduce the above copyright 79 #include 16 * notice, this list of conditions and the following disclaimer in the 80 #include 17 * documentation and/or other materials provided with the distribution. 81 #include 18 * 3. 
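exec_check_permissions() above refuses to exec a file that currently has writers (ETXTBSY); the same v_writecount/VV_TEXT exclusion means a running executable cannot be opened for writing. A userland sketch, where the path of a running binary is supplied by the caller:

        #include <err.h>
        #include <errno.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(int argc, char **argv)
        {
                int fd;

                if (argc != 2)
                        errx(1, "usage: trywrite /path/to/running/binary");
                fd = open(argv[1], O_WRONLY);
                if (fd == -1 && errno == ETXTBSY)
                        printf("%s is busy text (ETXTBSY), as expected\n", argv[1]);
                else if (fd != -1)
                        close(fd);
                return (0);
        }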
All advertising materials mentioning features or use of this software 82 #include 19 * must display the following acknowledgement: 83 #include 20 * This product includes software developed by the University of 84 #include 21 * California, Berkeley and its contributors. 85 22 * 4. Neither the name of the University nor the names of its contributors 86 /* Required to be non-static for SysVR4 emulator */ 23 * may be used to endorse or promote products derived from this software 87 MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); 24 * without specific prior written permission. 88 25 * 89 static int wait1(struct thread *, struct wait_args *, int); 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 90 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 91 /* 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 92 * exit -- 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 93 * Death of process. 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 94 * 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 95 * MPSAFE 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 96 */ 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 97 void 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 98 sys_exit(struct thread *td, struct sys_exit_args *uap) 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 99 { 36 * SUCH DAMAGE. 100 37 * 101 mtx_lock(&Giant); 38 * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 102 exit1(td, W_EXITCODE(uap->rval, 0)); 39 */ 103 /* NOTREACHED */ 40 104 } 41 #include 105 42 __FBSDID("$FreeBSD: src/sys/kern/kern_exit.c,v 1.218 2003/11/14 18:49:01 cogne 106 /* t Exp $"); 107 * Exit: deallocate address space and other resources, change proc state 43 108 * to zombie, and unlink proc from allproc and parent’s lists. Save exit 44 #include "opt_compat.h" 109 * status and rusage for wait(). Check for child processes and orphan them. 45 #include "opt_ktrace.h" 110 */ 46 #include "opt_mac.h" 111 void 47 112 exit1(struct thread *td, int rv) 48 #include 113 { 49 #include 114 struct proc *p, *nq, *q; 50 #include 115 struct tty *tp; 51 #include 116 struct vnode *ttyvp; 52 #include 117 struct vmspace *vm; 53 #include 118 struct vnode *vtmp; 54 #include 119 #ifdef KTRACE 55 #include 120 struct vnode *tracevp; 56 #include 121 struct ucred *tracecred; 57 #include 122 #endif 58 #include 123 59 #include 124 GIANT_REQUIRED; 60 #include 125 61 #include 126 p = td->td_proc; 62 #include 127 if (p == initproc) { 63 #include 128 printf("init died (signal %d, exit %d)\n", 64 #include 129 WTERMSIG(rv), WEXITSTATUS(rv)); 11/14/03 10:49:01 sys/kern/kern_exit.c 2 130 panic("Going nowhere without my init!"); 194 while (p->p_peers != NULL) 131 } 195 msleep(p, &ppeers_lock, PWAIT, "exit1", 0); 132 196 mtx_unlock(&ppeers_lock); 133 /* 197 } 134 * MUST abort all other threads before proceeding past here. 198 135 */ 199 #ifdef PGINPROF 136 PROC_LOCK(p); 200 vmsizmon(); 137 if (p->p_flag & P_SA || p->p_numthreads > 1) { 201 #endif 138 /* 202 STOPEVENT(p, S_EXIT, rv); 139 * First check if some other thread got here before us.. 203 wakeup(&p->p_stype); /* Wakeup anyone in procfs’ PIOCWAIT */ 140 * if so, act apropriatly, (exit or suspend); 204 141 */ 205 /* 142 thread_suspend_check(0); 206 * Check if any loadable modules need anything done at process exit. 
143 207 * e.g. SYSV IPC stuff 144 /* 208 * XXX what if one of these generates an error? 145 * Kill off the other threads. This requires 209 */ 146 * Some co-operation from other parts of the kernel 210 EVENTHANDLER_INVOKE(process_exit, p); 147 * so it may not be instant. 211 148 * With this state set: 212 MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), 149 * Any thread entering the kernel from userspace will 213 M_ZOMBIE, M_WAITOK); 150 * thread_exit() in trap(). Any thread attempting to 214 /* 151 * sleep will return immediatly 215 * If parent is waiting for us to exit or exec, 152 * with EINTR or EWOULDBLOCK, which will hopefully force them 216 * P_PPWAIT is set; we will wakeup the parent below. 153 * to back out to userland, freeing resources as they go, and 217 */ 154 * anything attempting to return to userland will thread_exit( 218 PROC_LOCK(p); ) 219 stopprofclock(p); 155 * from userret(). thread_exit() will unsuspend us 220 p->p_flag &= ˜(P_TRACED | P_PPWAIT); 156 * when the last other thread exits. 221 SIGEMPTYSET(p->p_siglist); 157 */ 222 SIGEMPTYSET(td->td_siglist); 158 if (thread_single(SINGLE_EXIT)) { 223 159 panic ("Exit: Single threading fouled up"); 224 /* 160 } 225 * Stop the real interval timer. If the handler is currently 161 /* 226 * executing, prevent it from rearming itself and let it finish. 162 * All other activity in this process is now stopped. 227 */ 163 * Remove excess KSEs and KSEGRPS. XXXKSE (when we have them) 228 if (timevalisset(&p->p_realtimer.it_value) && 164 * ... 229 callout_stop(&p->p_itcallout) == 0) { 165 * Turn off threading support. 230 timevalclear(&p->p_realtimer.it_interval); 166 */ 231 msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); 167 p->p_flag &= ˜P_SA; 232 KASSERT(!timevalisset(&p->p_realtimer.it_value), 168 thread_single_end(); /* Don’t need this any more. */ 233 ("realtime timer is still armed")); 169 } 234 } 170 /* 235 PROC_UNLOCK(p); 171 * With this state set: 236 172 * Any thread entering the kernel from userspace will thread_exit() 237 /* 173 * in trap(). Any thread attempting to sleep will return immediatly 238 * Reset any sigio structures pointing to us as a result of 174 * with EINTR or EWOULDBLOCK, which will hopefully force them 239 * F_SETOWN with our pid. 175 * to back out to userland, freeing resources as they go, and 240 */ 176 * anything attempting to return to userland will thread_exit() 241 funsetownlst(&p->p_sigiolst); 177 * from userret(). thread_exit() will do a wakeup on p->p_numthreads 242 178 * if it transitions to 1. 243 /* 179 */ 244 * Close open files and release open-file table. 180 245 * This may block! 181 p->p_flag |= P_WEXIT; 246 */ 182 PROC_UNLOCK(p); 247 fdfree(td); 183 248 184 /* Are we a task leader? */ 249 /* 185 if (p == p->p_leader) { 250 * Remove ourself from our leader’s peer list and wake our leader. 186 mtx_lock(&ppeers_lock); 251 */ 187 q = p->p_peers; 252 mtx_lock(&ppeers_lock); 188 while (q != NULL) { 253 if (p->p_leader->p_peers) { 189 PROC_LOCK(q); 254 q = p->p_leader; 190 psignal(q, SIGKILL); 255 while (q->p_peers != p) 191 PROC_UNLOCK(q); 256 q = q->p_peers; 192 q = q->p_peers; 257 q->p_peers = p->p_peers; 193 } 258 wakeup(p->p_leader); 11/14/03 10:49:01 sys/kern/kern_exit.c 3 259 } 323 VOP_REVOKE(ttyvp, REVOKEALL); 260 mtx_unlock(&ppeers_lock); 324 vrele(ttyvp); 261 325 sx_xlock(&proctree_lock); 262 /* The next two chunks should probably be moved to vmspace_exit. */ 326 } 263 vm = p->p_vmspace; 327 } 264 /* 328 if (sp->s_ttyvp) { 265 * Release user portion of address space. 
329 ttyvp = sp->s_ttyvp; 266 * This releases references to vnodes, 330 SESS_LOCK(p->p_session); 267 * which could cause I/O if the file has been unlinked. 331 sp->s_ttyvp = NULL; 268 * Need to do this early enough that we can still sleep. 332 SESS_UNLOCK(p->p_session); 269 * Can’t free the entire vmspace as the kernel stack 333 vrele(ttyvp); 270 * may be mapped within that space also. 334 } 271 * 335 /* 272 * Processes sharing the same vmspace may exit in one order, and 336 * s_ttyp is not zero’d; we use this to indicate 273 * get cleaned up by vmspace_exit() in a different order. The 337 * that the session once had a controlling terminal. 274 * last exiting process to reach this point releases as much of 338 * (for logging and informational purposes) 275 * the environment as it can, and the last process cleaned up 339 */ 276 * by vmspace_exit() (which decrements exitingcnt) cleans up the 340 } 277 * remainder. 341 SESS_LOCK(p->p_session); 278 */ 342 sp->s_leader = NULL; 279 ++vm->vm_exitingcnt; 343 SESS_UNLOCK(p->p_session); 280 if (--vm->vm_refcnt == 0) { 344 } 281 shmexit(vm); 345 fixjobc(p, p->p_pgrp, 0); 282 vm_page_lock_queues(); 346 sx_xunlock(&proctree_lock); 283 pmap_remove_pages(vmspace_pmap(vm), vm_map_min(&vm->vm_map), 347 (void)acct_process(td); 284 vm_map_max(&vm->vm_map)); 348 #ifdef KTRACE 285 vm_page_unlock_queues(); 349 /* 286 (void) vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), 350 * release trace file 287 vm_map_max(&vm->vm_map)); 351 */ 288 } 352 PROC_LOCK(p); 289 353 mtx_lock(&ktrace_mtx); 290 sx_xlock(&proctree_lock); 354 p->p_traceflag = 0; /* don’t trace the vrele() */ 291 if (SESS_LEADER(p)) { 355 tracevp = p->p_tracevp; 292 struct session *sp; 356 p->p_tracevp = NULL; 293 357 tracecred = p->p_tracecred; 294 sp = p->p_session; 358 p->p_tracecred = NULL; 295 if (sp->s_ttyvp) { 359 mtx_unlock(&ktrace_mtx); 296 /* 360 PROC_UNLOCK(p); 297 * Controlling process. 361 if (tracevp != NULL) 298 * Signal foreground pgrp, 362 vrele(tracevp); 299 * drain controlling terminal 363 if (tracecred != NULL) 300 * and revoke access to controlling terminal. 364 crfree(tracecred); 301 */ 365 #endif 302 if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { 366 /* 303 tp = sp->s_ttyp; 367 * Release reference to text vnode 304 if (sp->s_ttyp->t_pgrp) { 368 */ 305 PGRP_LOCK(sp->s_ttyp->t_pgrp); 369 if ((vtmp = p->p_textvp) != NULL) { 306 pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1 370 p->p_textvp = NULL; ); 371 vrele(vtmp); 307 PGRP_UNLOCK(sp->s_ttyp->t_pgrp); 372 } 308 } 373 309 /* XXX tp should be locked. */ 374 /* 310 sx_xunlock(&proctree_lock); 375 * Release our limits structure. 311 (void) ttywait(tp); 376 */ 312 sx_xlock(&proctree_lock); 377 mtx_assert(&Giant, MA_OWNED); 313 /* 378 if (--p->p_limit->p_refcnt == 0) { 314 * The tty could have been revoked 379 FREE(p->p_limit, M_SUBPROC); 315 * if we blocked. 380 p->p_limit = NULL; 316 */ 381 } 317 if (sp->s_ttyvp) { 382 318 ttyvp = sp->s_ttyvp; 383 /* 319 SESS_LOCK(p->p_session); 384 * Release this thread’s reference to the ucred. The actual proc 320 sp->s_ttyvp = NULL; 385 * reference will stay around until the proc is harvested by 321 SESS_UNLOCK(p->p_session); 386 * wait(). At this point the ucred is immutable (no other threads 322 sx_xunlock(&proctree_lock); 387 * from this proc are around that can change it) so we leave the 11/14/03 10:49:01 sys/kern/kern_exit.c 4 388 * per-thread ucred pointer intact in case it is needed although 453 mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); 389 * in theory nothing should be using it at this point. 
454 if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { 390 */ 455 struct proc *pp; 391 crfree(td->td_ucred); 456 392 457 mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); 393 /* 458 pp = p->p_pptr; 394 * Remove proc from allproc queue and pidhash chain. 459 PROC_UNLOCK(pp); 395 * Place onto zombproc. Unlink from parent’s child list. 460 proc_reparent(p, initproc); 396 */ 461 PROC_LOCK(p->p_pptr); 397 sx_xlock(&allproc_lock); 462 /* 398 LIST_REMOVE(p, p_list); 463 * If this was the last child of our parent, notify 399 LIST_INSERT_HEAD(&zombproc, p, p_list); 464 * parent, so in case he was wait(2)ing, he will 400 LIST_REMOVE(p, p_hash); 465 * continue. 401 sx_xunlock(&allproc_lock); 466 */ 402 467 if (LIST_EMPTY(&pp->p_children)) 403 sx_xlock(&proctree_lock); 468 wakeup(pp); 404 q = LIST_FIRST(&p->p_children); 469 } else 405 if (q != NULL) /* only need this if any child is S_ZOMB */ 470 mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); 406 wakeup(initproc); 471 407 for (; q != NULL; q = nq) { 472 if (p->p_sigparent && p->p_pptr != initproc) 408 nq = LIST_NEXT(q, p_sibling); 473 psignal(p->p_pptr, p->p_sigparent); 409 PROC_LOCK(q); 474 else 410 proc_reparent(q, initproc); 475 psignal(p->p_pptr, SIGCHLD); 411 q->p_sigparent = SIGCHLD; 476 PROC_UNLOCK(p->p_pptr); 412 /* 477 413 * Traced processes are killed 478 /* 414 * since their existence means someone is screwing up. 479 * If this is a kthread, then wakeup anyone waiting for it to exit. 415 */ 480 */ 416 if (q->p_flag & P_TRACED) { 481 if (p->p_flag & P_KTHREAD) 417 q->p_flag &= ˜P_TRACED; 482 wakeup(p); 418 psignal(q, SIGKILL); 483 PROC_UNLOCK(p); 419 } 484 420 PROC_UNLOCK(q); 485 /* 421 } 486 * Finally, call machine-dependent code to release the remaining 422 487 * resources including address space. 423 /* 488 * The address space is released by "vmspace_exitfree(p)" in 424 * Save exit status and final rusage info, adding in child rusage 489 * vm_waitproc(). 425 * info and self times. 490 */ 426 */ 491 cpu_exit(td); 427 PROC_LOCK(p); 492 428 p->p_xstat = rv; 493 PROC_LOCK(p); 429 *p->p_ru = p->p_stats->p_ru; 494 PROC_LOCK(p->p_pptr); 430 mtx_lock_spin(&sched_lock); 495 sx_xunlock(&proctree_lock); 431 calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); 496 mtx_lock_spin(&sched_lock); 432 mtx_unlock_spin(&sched_lock); 497 433 ruadd(p->p_ru, &p->p_stats->p_cru); 498 while (mtx_owned(&Giant)) 434 499 mtx_unlock(&Giant); 435 /* 500 436 * Notify interested parties of our demise. 501 /* 437 */ 502 * We have to wait until after acquiring all locks before 438 KNOTE(&p->p_klist, NOTE_EXIT); 503 * changing p_state. If we block on a mutex then we will be 439 /* 504 * back at SRUN when we resume and our parent will never 440 * Just delete all entries in the p_klist. At this point we won’t 505 * harvest us. 441 * report any more events, and there are nasty race conditions that 506 */ 442 * can beat us if we don’t. 507 p->p_state = PRS_ZOMBIE; 443 */ 508 444 while (SLIST_FIRST(&p->p_klist)) 509 wakeup(p->p_pptr); 445 SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext); 510 PROC_UNLOCK(p->p_pptr); 446 511 cnt.v_swtch++; 447 /* 512 binuptime(PCPU_PTR(switchtime)); 448 * Notify parent that we’re gone. If parent has the PS_NOCLDWAIT 513 PCPU_SET(switchticks, ticks); 449 * flag set, or if the handler is set to SIG_IGN, notify process 514 450 * 1 instead (and hope it will handle this situation). 
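The PS_NOCLDWAIT/PS_CLDSIGIGN test above is the kernel side of SA_NOCLDWAIT and of ignoring SIGCHLD: children of such a parent are reparented to init at exit instead of lingering as zombies for the parent to wait() on. A userland sketch:

        #include <err.h>
        #include <signal.h>
        #include <unistd.h>

        int
        main(void)
        {
                struct sigaction sa;

                sa.sa_handler = SIG_IGN;        /* ignoring SIGCHLD sets PS_CLDSIGIGN */
                sigemptyset(&sa.sa_mask);
                sa.sa_flags = SA_NOCLDWAIT;     /* sets PS_NOCLDWAIT */
                if (sigaction(SIGCHLD, &sa, NULL) == -1)
                        err(1, "sigaction");
                if (fork() == 0)
                        _exit(0);               /* child is reaped automatically */
                sleep(1);
                return (0);
        }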
515 cpu_sched_exit(td); /* XXXKSE check if this should be in thread_exit * 451 */ / 452 PROC_LOCK(p->p_pptr); 516 /* 11/14/03 10:49:01 sys/kern/kern_exit.c 5 517 * Allow the scheduler to adjust the priority of the 582 PROC_LOCK(p); 518 * parent when a kseg is exiting. 583 if (uap->pid != WAIT_ANY && 519 */ 584 p->p_pid != uap->pid && p->p_pgid != -uap->pid) { 520 if (p->p_pid != 1) 585 PROC_UNLOCK(p); 521 sched_exit(p->p_pptr, p); 586 continue; 522 587 } 523 /* 588 524 * Make sure the scheduler takes this thread out of its tables etc. 589 /* 525 * This will also release this thread’s reference to the ucred. 590 * This special case handles a kthread spawned by linux_clone 526 * Other thread parts to release include pcb bits and such. 591 * (see linux_misc.c). The linux_wait4 and linux_waitpid 527 */ 592 * functions need to be able to distinguish between waiting 528 thread_exit(); 593 * on a process and waiting on a thread. It is a thread if 529 } 594 * p_sigparent is not SIGCHLD, and the WLINUXCLONE option 530 595 * signifies we want to wait for threads and not processes. 531 #ifdef COMPAT_43 596 */ 532 /* 597 if ((p->p_sigparent != SIGCHLD) ^ 533 * MPSAFE. The dirty work is handled by wait1(). 598 ((uap->options & WLINUXCLONE) != 0)) { 534 */ 599 PROC_UNLOCK(p); 535 int 600 continue; 536 owait(struct thread *td, struct owait_args *uap __unused) 601 } 537 { 602 538 struct wait_args w; 603 nfound++; 539 604 if (p->p_state == PRS_ZOMBIE) { 540 w.options = 0; 605 td->td_retval[0] = p->p_pid; 541 w.rusage = NULL; 606 #ifdef COMPAT_43 542 w.pid = WAIT_ANY; 607 if (compat) 543 w.status = NULL; 608 td->td_retval[1] = p->p_xstat; 544 return (wait1(td, &w, 1)); 609 else 545 } 610 #endif 546 #endif /* COMPAT_43 */ 611 if (uap->status) { 547 612 status = p->p_xstat; /* convert to int */ 548 /* 613 PROC_UNLOCK(p); 549 * MPSAFE. The dirty work is handled by wait1(). 614 if ((error = copyout(&status, 550 */ 615 uap->status, sizeof(status)))) { 551 int 616 sx_xunlock(&proctree_lock); 552 wait4(struct thread *td, struct wait_args *uap) 617 mtx_unlock(&Giant); 553 { 618 return (error); 554 619 } 555 return (wait1(td, uap, 0)); 620 PROC_LOCK(p); 556 } 621 } 557 622 if (uap->rusage) { 558 /* 623 bcopy(p->p_ru, &ru, sizeof(ru)); 559 * MPSAFE 624 PROC_UNLOCK(p); 560 */ 625 if ((error = copyout(&ru, 561 static int 626 uap->rusage, sizeof (struct rusage)))) { 562 wait1(struct thread *td, struct wait_args *uap, int compat) 627 sx_xunlock(&proctree_lock); 563 { 628 mtx_unlock(&Giant); 564 struct rusage ru; 629 return (error); 565 int nfound; 630 } 566 struct proc *p, *q, *t; 631 } else 567 int status, error; 632 PROC_UNLOCK(p); 568 633 /* 569 q = td->td_proc; 634 * If we got the child via a ptrace ’attach’, 570 if (uap->pid == 0) { 635 * we need to give it back to the old parent. 
571 PROC_LOCK(q); 636 */ 572 uap->pid = -q->p_pgid; 637 if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { 573 PROC_UNLOCK(q); 638 PROC_LOCK(p); 574 } 639 p->p_oppid = 0; 575 if (uap->options &˜ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE)) 640 proc_reparent(p, t); 576 return (EINVAL); 641 PROC_UNLOCK(p); 577 mtx_lock(&Giant); 642 psignal(t, SIGCHLD); 578 loop: 643 wakeup(t); 579 nfound = 0; 644 PROC_UNLOCK(t); 580 sx_xlock(&proctree_lock); 645 sx_xunlock(&proctree_lock); 581 LIST_FOREACH(p, &q->p_children, p_sibling) { 646 mtx_unlock(&Giant); 11/14/03 10:49:01 sys/kern/kern_exit.c 6 647 return (0); 712 } 648 } 713 mtx_lock_spin(&sched_lock); 649 714 if (P_SHOULDSTOP(p) && (p->p_suspcount == p->p_numthreads) && 650 /* 715 ((p->p_flag & P_WAITED) == 0) && 651 * Remove other references to this process to ensure 716 (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { 652 * we have an exclusive reference. 717 mtx_unlock_spin(&sched_lock); 653 */ 718 p->p_flag |= P_WAITED; 654 sx_xlock(&allproc_lock); 719 sx_xunlock(&proctree_lock); 655 LIST_REMOVE(p, p_list); /* off zombproc */ 720 td->td_retval[0] = p->p_pid; 656 sx_xunlock(&allproc_lock); 721 #ifdef COMPAT_43 657 LIST_REMOVE(p, p_sibling); 722 if (compat) { 658 leavepgrp(p); 723 td->td_retval[1] = W_STOPCODE(p->p_xstat); 659 sx_xunlock(&proctree_lock); 724 PROC_UNLOCK(p); 660 725 error = 0; 661 /* 726 } else 662 * As a side effect of this lock, we know that 727 #endif 663 * all other writes to this proc are visible now, so 728 if (uap->status) { 664 * no more locking is needed for p. 729 status = W_STOPCODE(p->p_xstat); 665 */ 730 PROC_UNLOCK(p); 666 PROC_LOCK(p); 731 error = copyout(&status, 667 p->p_xstat = 0; /* XXX: why? */ 732 uap->status, sizeof(status)); 668 PROC_UNLOCK(p); 733 } else { 669 PROC_LOCK(q); 734 PROC_UNLOCK(p); 670 ruadd(&q->p_stats->p_cru, p->p_ru); 735 error = 0; 671 PROC_UNLOCK(q); 736 } 672 FREE(p->p_ru, M_ZOMBIE); 737 mtx_unlock(&Giant); 673 p->p_ru = NULL; 738 return (error); 674 739 } 675 /* 740 mtx_unlock_spin(&sched_lock); 676 * Decrement the count of procs running with this uid. 741 if (uap->options & WCONTINUED && (p->p_flag & P_CONTINUED)) { 677 */ 742 sx_xunlock(&proctree_lock); 678 (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); 743 td->td_retval[0] = p->p_pid; 679 744 p->p_flag &= ˜P_CONTINUED; 680 /* 745 PROC_UNLOCK(p); 681 * Free credentials, arguments, and sigacts 746 682 */ 747 if (uap->status) { 683 crfree(p->p_ucred); 748 status = SIGCONT; 684 p->p_ucred = NULL; 749 error = copyout(&status, 685 pargs_drop(p->p_args); 750 uap->status, sizeof(status)); 686 p->p_args = NULL; 751 } else 687 sigacts_free(p->p_sigacts); 752 error = 0; 688 p->p_sigacts = NULL; 753 689 754 mtx_unlock(&Giant); 690 /* 755 return (error); 691 * do any thread-system specific cleanups 756 } 692 */ 757 PROC_UNLOCK(p); 693 thread_wait(p); 758 } 694 759 if (nfound == 0) { 695 /* 760 sx_xunlock(&proctree_lock); 696 * Give vm and machine-dependent layer a chance 761 mtx_unlock(&Giant); 697 * to free anything that cpu_exit couldn’t 762 return (ECHILD); 698 * release while still running in process context. 
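For a zombie child, wait1() above copies out the exit status stored in p_xstat and the rusage accumulated in p_ru. The corresponding userland call goes through wait4(2):

        #include <sys/types.h>
        #include <sys/time.h>
        #include <sys/resource.h>
        #include <sys/wait.h>
        #include <err.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(void)
        {
                struct rusage ru;
                pid_t pid;
                int status;

                if ((pid = fork()) == 0)
                        _exit(7);
                if (wait4(pid, &status, 0, &ru) == -1)
                        err(1, "wait4");
                if (WIFEXITED(status))
                        printf("pid %d exited with %d, %ld.%06ld s user time\n",
                            (int)pid, WEXITSTATUS(status),
                            (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
                return (0);
        }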
763 } 699 */ 764 if (uap->options & WNOHANG) { 700 vm_waitproc(p); 765 sx_xunlock(&proctree_lock); 701 #ifdef MAC 766 td->td_retval[0] = 0; 702 mac_destroy_proc(p); 767 mtx_unlock(&Giant); 703 #endif 768 return (0); 704 KASSERT(FIRST_THREAD_IN_PROC(p), 769 } 705 ("wait1: no residual thread!")); 770 PROC_LOCK(q); 706 uma_zfree(proc_zone, p); 771 sx_xunlock(&proctree_lock); 707 sx_xlock(&allproc_lock); 772 error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); 708 nprocs--; 773 PROC_UNLOCK(q); 709 sx_xunlock(&allproc_lock); 774 if (error) { 710 mtx_unlock(&Giant); 775 mtx_unlock(&Giant); 711 return (0); 776 return (error); 11/14/03 10:49:01 sys/kern/kern_exit.c 7 777 } 778 goto loop; 779 } 780 781 /* 782 * Make process ’parent’ the new parent of process ’child’. 783 * Must be called with an exclusive hold of proctree lock. 784 */ 785 void 786 proc_reparent(struct proc *child, struct proc *parent) 787 { 788 789 sx_assert(&proctree_lock, SX_XLOCKED); 790 PROC_LOCK_ASSERT(child, MA_OWNED); 791 if (child->p_pptr == parent) 792 return; 793 794 LIST_REMOVE(child, p_sibling); 795 LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); 796 child->p_pptr = parent; 797 } 10/29/03 07:23:09 sys/kern/kern_fork.c 1 1 /* 65 #include 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 66 #include 3 * The Regents of the University of California. All rights reserved. 67 #include 4 * (c) UNIX System Laboratories, Inc. 68 #include 5 * All or some portions of this file are derived from material licensed 69 #include 6 * to the University of California by American Telephone and Telegraph 70 #include 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 71 #include 8 * the permission of UNIX System Laboratories, Inc. 72 9 * 73 #include 10 * Redistribution and use in source and binary forms, with or without 74 #include 11 * modification, are permitted provided that the following conditions 75 #include 12 * are met: 76 #include 13 * 1. Redistributions of source code must retain the above copyright 77 #include 14 * notice, this list of conditions and the following disclaimer. 78 15 * 2. Redistributions in binary form must reproduce the above copyright 79 #include 16 * notice, this list of conditions and the following disclaimer in the 80 #include 17 * documentation and/or other materials provided with the distribution. 81 18 * 3. All advertising materials mentioning features or use of this software 82 #ifndef _SYS_SYSPROTO_H_ 19 * must display the following acknowledgement: 83 struct fork_args { 20 * This product includes software developed by the University of 84 int dummy; 21 * California, Berkeley and its contributors. 85 }; 22 * 4. Neither the name of the University nor the names of its contributors 86 #endif 23 * may be used to endorse or promote products derived from this software 87 24 * without specific prior written permission. 88 static int forksleep; /* Place for fork1() to sleep on. */ 25 * 89 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 90 /* 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 91 * MPSAFE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 92 */ 29 * ARE DISCLAIMED. 
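The wait1() path shown above is what user programs observe through wait4(2): a zombie child yields its exit status and accumulated rusage, WNOHANG returns 0 immediately when nothing is reapable, and the parent otherwise sleeps on its own proc until a child changes state. A minimal userland sketch of those semantics, using only standard libc calls (this is illustrative, not part of the listing):

#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct rusage ru;
	pid_t pid;
	int status;

	pid = fork();
	if (pid == 0)
		_exit(7);		/* child becomes a zombie until reaped */

	/* WNOHANG polls without sleeping; 0 means "no child ready yet". */
	while (wait4(pid, &status, WNOHANG, &ru) == 0)
		usleep(1000);

	if (WIFEXITED(status))
		printf("pid %d exited with status %d\n",
		    (int)pid, WEXITSTATUS(status));
	return (0);
}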
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 93 /* ARGSUSED */ 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 94 int 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 95 fork(td, uap) 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 96 struct thread *td; 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 97 struct fork_args *uap; 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 98 { 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 99 int error; 36 * SUCH DAMAGE. 100 struct proc *p2; 37 * 101 38 * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 102 error = fork1(td, RFFDG | RFPROC, 0, &p2); 39 */ 103 if (error == 0) { 40 104 td->td_retval[0] = p2->p_pid; 41 #include 105 td->td_retval[1] = 0; 42 __FBSDID("$FreeBSD: src/sys/kern/kern_fork.c,v 1.208 2003/10/29 15:23:09 bde E 106 } xp $"); 107 return (error); 43 108 } 44 #include "opt_ktrace.h" 109 45 #include "opt_mac.h" 110 /* 46 111 * MPSAFE 47 #include 112 */ 48 #include 113 /* ARGSUSED */ 49 #include 114 int 50 #include 115 vfork(td, uap) 51 #include 116 struct thread *td; 52 #include 117 struct vfork_args *uap; 53 #include 118 { 54 #include 119 int error; 55 #include 120 struct proc *p2; 56 #include 121 57 #include 122 error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2); 58 #include 123 if (error == 0) { 59 #include 124 td->td_retval[0] = p2->p_pid; 60 #include 125 td->td_retval[1] = 0; 61 #include 126 } 62 #include 127 return (error); 63 #include 128 } 64 #include 129 10/29/03 07:23:09 sys/kern/kern_fork.c 2 130 /* 194 struct thread *td; 131 * MPSAFE 195 int flags; 132 */ 196 int pages; 133 int 197 struct proc **procp; 134 rfork(td, uap) 198 { 135 struct thread *td; 199 struct proc *p1, *p2, *pptr; 136 struct rfork_args *uap; 200 uid_t uid; 137 { 201 struct proc *newproc; 138 int error; 202 int ok, trypid; 139 struct proc *p2; 203 static int curfail, pidchecked = 0; 140 204 static struct timeval lastfail; 141 /* Don’t allow kernel only flags. */ 205 struct filedesc *fd; 142 if ((uap->flags & RFKERNELONLY) != 0) 206 struct filedesc_to_leader *fdtol; 143 return (EINVAL); 207 struct thread *td2; 144 error = fork1(td, uap->flags, 0, &p2); 208 struct kse *ke2; 145 if (error == 0) { 209 struct ksegrp *kg2; 146 td->td_retval[0] = p2 ? p2->p_pid : 0; 210 struct sigacts *newsigacts; 147 td->td_retval[1] = 0; 211 int error; 148 } 212 149 return (error); 213 /* Can’t copy and clear. */ 150 } 214 if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) 151 215 return (EINVAL); 152 int nprocs = 1; /* process 0 */ 216 153 int lastpid = 0; 217 p1 = td->td_proc; 154 SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, 218 mtx_lock(&Giant); 155 "Last used PID"); 219 156 220 /* 157 /* 221 * Here we don’t create a new process, but we divorce 158 * Random component to lastpid generation. We mix in a random factor to make 222 * certain parts of a process from itself. 159 * it a little harder to predict. We sanity check the modulus value to avoid 223 */ 160 * doing it in critical paths. Don’t let it be too small or we pointlessly 224 if ((flags & RFPROC) == 0) { 161 * waste randomness entropy, and don’t let it be impossibly large. Using a 225 vm_forkproc(td, NULL, NULL, flags); 162 * modulus that is too big causes a LOT more process table scans and slows 226 163 * down fork processing as the pidchecked caching is defeated. 227 /* 164 */ 228 * Close all file descriptors. 
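As the listing shows, fork(), vfork() and rfork() all funnel into fork1() and differ only in the RF* flag mask: fork() passes RFFDG | RFPROC, vfork() adds RFPPWAIT | RFMEM, and rfork() passes the caller's flags after rejecting RFKERNELONLY. On FreeBSD the same combinations can be exercised directly from userland with rfork(2); a small sketch (rfork() and the RF* flags are declared via <unistd.h> on FreeBSD; illustrative only):

#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;

	/* Same flag mask fork() uses: new process, private copy of the fd table. */
	pid = rfork(RFFDG | RFPROC);
	if (pid == -1) {
		perror("rfork");
		exit(1);
	}
	if (pid == 0)
		_exit(0);
	waitpid(pid, NULL, 0);
	printf("reaped %d\n", (int)pid);
	return (0);
}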
165 static int randompid = 0; 229 */ 166 230 if (flags & RFCFDG) { 167 static int 231 struct filedesc *fdtmp; 168 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) 232 fdtmp = fdinit(td->td_proc->p_fd); 169 { 233 fdfree(td); 170 int error, pid; 234 p1->p_fd = fdtmp; 171 235 } 172 sysctl_wire_old_buffer(req, sizeof(int)); 236 173 sx_xlock(&allproc_lock); 237 /* 174 pid = randompid; 238 * Unshare file descriptors (from parent.) 175 error = sysctl_handle_int(oidp, &pid, 0, req); 239 */ 176 if (error == 0 && req->newptr != NULL) { 240 if (flags & RFFDG) { 177 if (pid < 0 || pid > PID_MAX - 100) /* out of range */ 241 FILEDESC_LOCK(p1->p_fd); 178 pid = PID_MAX - 100; 242 if (p1->p_fd->fd_refcnt > 1) { 179 else if (pid < 2) /* NOP */ 243 struct filedesc *newfd; 180 pid = 0; 244 181 else if (pid < 100) /* Make it reasonable 245 newfd = fdcopy(td->td_proc->p_fd); */ 246 FILEDESC_UNLOCK(p1->p_fd); 182 pid = 100; 247 fdfree(td); 183 randompid = pid; 248 p1->p_fd = newfd; 184 } 249 } else 185 sx_xunlock(&allproc_lock); 250 FILEDESC_UNLOCK(p1->p_fd); 186 return (error); 251 } 187 } 252 mtx_unlock(&Giant); 188 253 *procp = NULL; 189 SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 254 return (0); 190 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); 255 } 191 256 192 int 257 /* 193 fork1(td, flags, pages, procp) 258 * Note 1:1 allows for forking with one thread coming out on the 10/29/03 07:23:09 sys/kern/kern_fork.c 3 259 * other side with the expectation that the process is about to 323 nprocs++; 260 * exec. 324 261 */ 325 /* 262 if (p1->p_flag & P_SA) { 326 * Find an unused process ID. We remember a range of unused IDs 263 /* 327 * ready to use (from lastpid+1 through pidchecked-1). 264 * Idle the other threads for a second. 328 * 265 * Since the user space is copied, it must remain stable. 329 * If RFHIGHPID is set (used during system boot), do not allocate 266 * In addition, all threads (from the user perspective) 330 * low-numbered pids. 267 * need to either be suspended or in the kernel, 331 */ 268 * where they will try restart in the parent and will 332 trypid = lastpid + 1; 269 * be aborted in the child. 333 if (flags & RFHIGHPID) { 270 */ 334 if (trypid < 10) 271 PROC_LOCK(p1); 335 trypid = 10; 272 if (thread_single(SINGLE_NO_EXIT)) { 336 } else { 273 /* Abort.. someone else is single threading before us 337 if (randompid) */ 338 trypid += arc4random() % randompid; 274 PROC_UNLOCK(p1); 339 } 275 mtx_unlock(&Giant); 340 retry: 276 return (ERESTART); 341 /* 277 } 342 * If the process ID prototype has wrapped around, 278 PROC_UNLOCK(p1); 343 * restart somewhat above 0, as the low-numbered procs 279 /* 344 * tend to include daemons that don’t exit. 280 * All other activity in this process 345 */ 281 * is now suspended at the user boundary, 346 if (trypid >= PID_MAX) { 282 * (or other safe places if we think of any). 347 trypid = trypid % PID_MAX; 283 */ 348 if (trypid < 100) 284 } 349 trypid += 100; 285 350 pidchecked = 0; 286 /* Allocate new proc. */ 351 } 287 newproc = uma_zalloc(proc_zone, M_WAITOK); 352 if (trypid >= pidchecked) { 288 #ifdef MAC 353 int doingzomb = 0; 289 mac_init_proc(newproc); 354 290 #endif 355 pidchecked = PID_MAX; 291 356 /* 292 /* 357 * Scan the active and zombie procs to check whether this pid 293 * Although process entries are dynamically created, we still keep 358 * is in use. Remember the lowest pid that’s greater 294 * a global limit on the maximum number we will create. Don’t allow 359 * than trypid, so we can avoid checking for a while. 
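The sysctl handler above clamps the requested randompid modulus before fork1() ever uses it; inside fork1() the candidate PID then becomes lastpid + 1 plus a random offset below that modulus. A standalone sketch of just that arithmetic (the clamping thresholds and variable names are copied from the listing; the PID_MAX value is assumed for the sketch; arc4random() is the libc routine):

#include <stdio.h>
#include <stdlib.h>

#define	PID_MAX	99999		/* assumed value of the kernel's PID_MAX */

static int randompid;

/* Mirror the clamping done by sysctl_kern_randompid(). */
static int
clamp_randompid(int pid)
{
	if (pid < 0 || pid > PID_MAX - 100)	/* out of range */
		pid = PID_MAX - 100;
	else if (pid < 2)			/* NOP */
		pid = 0;
	else if (pid < 100)			/* make it reasonable */
		pid = 100;
	return (pid);
}

int
main(void)
{
	int lastpid = 700, trypid;

	randompid = clamp_randompid(50);	/* too small: raised to 100 */
	trypid = lastpid + 1;
	if (randompid != 0)
		trypid += arc4random() % randompid;
	printf("randompid %d, first candidate pid %d\n", randompid, trypid);
	return (0);
}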
295 * a nonprivileged user to use the last ten processes; don’t let root 360 */ 296 * exceed the limit. The variable nprocs is the current number of 361 p2 = LIST_FIRST(&allproc); 297 * processes, maxproc is the limit. 362 again: 298 */ 363 for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { 299 sx_xlock(&allproc_lock); 364 PROC_LOCK(p2); 300 uid = td->td_ucred->cr_ruid; 365 while (p2->p_pid == trypid || 301 if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { 366 p2->p_pgrp->pg_id == trypid || 302 error = EAGAIN; 367 p2->p_session->s_sid == trypid) { 303 goto fail; 368 trypid++; 304 } 369 if (trypid >= pidchecked) { 305 370 PROC_UNLOCK(p2); 306 /* 371 goto retry; 307 * Increment the count of procs running with this uid. Don’t allow 372 } 308 * a nonprivileged user to exceed their current limit. 373 } 309 */ 374 if (p2->p_pid > trypid && pidchecked > p2->p_pid) 310 PROC_LOCK(p1); 375 pidchecked = p2->p_pid; 311 ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 376 if (p2->p_pgrp->pg_id > trypid && 312 (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0); 377 pidchecked > p2->p_pgrp->pg_id) 313 PROC_UNLOCK(p1); 378 pidchecked = p2->p_pgrp->pg_id; 314 if (!ok) { 379 if (p2->p_session->s_sid > trypid && 315 error = EAGAIN; 380 pidchecked > p2->p_session->s_sid) 316 goto fail; 381 pidchecked = p2->p_session->s_sid; 317 } 382 PROC_UNLOCK(p2); 318 383 } 319 /* 384 if (!doingzomb) { 320 * Increment the nprocs resource before blocking can occur. There 385 doingzomb = 1; 321 * are hard-limits as to the number of processes that can run. 386 p2 = LIST_FIRST(&zombproc); 322 */ 387 goto again; 10/29/03 07:23:09 sys/kern/kern_fork.c 4 388 } 453 * Start by zeroing the section of proc that is zero-initialized, 389 } 454 * then copy the section that is copied directly from the parent. 390 455 */ 391 /* 456 td2 = FIRST_THREAD_IN_PROC(p2); 392 * RFHIGHPID does not mess with the lastpid counter during boot. 457 kg2 = FIRST_KSEGRP_IN_PROC(p2); 393 */ 458 ke2 = FIRST_KSE_IN_KSEGRP(kg2); 394 if (flags & RFHIGHPID) 459 395 pidchecked = 0; 460 /* Allocate and switch to an alternate kstack if specified */ 396 else 461 if (pages != 0) 397 lastpid = trypid; 462 vm_thread_new_altkstack(td2, pages); 398 463 399 p2 = newproc; 464 PROC_LOCK(p2); 400 p2->p_state = PRS_NEW; /* protect against others */ 465 PROC_LOCK(p1); 401 p2->p_pid = trypid; 466 402 LIST_INSERT_HEAD(&allproc, p2, p_list); 467 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start) 403 LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); ) 404 sx_xunlock(&allproc_lock); 468 405 469 bzero(&p2->p_startzero, 406 /* 470 (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); 407 * Malloc things while we don’t hold any locks. 471 bzero(&ke2->ke_startzero, 408 */ 472 (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero)); 409 if (flags & RFSIGSHARE) 473 bzero(&td2->td_startzero, 410 newsigacts = NULL; 474 (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); 411 else 475 bzero(&kg2->kg_startzero, 412 newsigacts = sigacts_alloc(); 476 (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); 413 477 414 /* 478 bcopy(&p1->p_startcopy, &p2->p_startcopy, 415 * Copy filedesc. 
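The retry:/again: loop above is a linear scan over allproc and zombproc that advances trypid past any PID, process-group ID or session ID already in use, and records in pidchecked the lowest in-use ID above trypid so later forks can skip the scan entirely. A simplified standalone sketch of the same search over a plain array of used IDs (the array stands in for the proc lists and for pgids/sids alike; no locking, and the zombie pass is folded in):

#include <stdio.h>

#define	PID_MAX	99999		/* assumed value for the sketch */

static int lastpid, pidchecked;

static int
alloc_pid(const int *used, int nused)
{
	int i, trypid;

	trypid = lastpid + 1;
retry:
	if (trypid >= PID_MAX) {
		trypid = trypid % PID_MAX;
		if (trypid < 100)
			trypid += 100;	/* avoid the low, daemon-heavy range */
		pidchecked = 0;
	}
	if (trypid >= pidchecked) {
		pidchecked = PID_MAX;
		for (i = 0; i < nused; i++) {
			while (used[i] == trypid) {
				trypid++;
				if (trypid >= pidchecked)
					goto retry;
			}
			/* Remember the lowest in-use ID above trypid. */
			if (used[i] > trypid && pidchecked > used[i])
				pidchecked = used[i];
		}
	}
	lastpid = trypid;
	return (trypid);
}

int
main(void)
{
	int used[] = { 1, 2, 100, 101, 102 };

	lastpid = 99;
	printf("allocated %d, next scan can skip up to %d\n",
	    alloc_pid(used, 5), pidchecked);
	return (0);
}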
479 (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy)); 416 */ 480 bcopy(&td->td_startcopy, &td2->td_startcopy, 417 if (flags & RFCFDG) { 481 (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); 418 fd = fdinit(td->td_proc->p_fd); 482 bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy, 419 fdtol = NULL; 483 (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); 420 } else if (flags & RFFDG) { 484 #undef RANGEOF 421 FILEDESC_LOCK(p1->p_fd); 485 422 fd = fdcopy(td->td_proc->p_fd); 486 /* Set up the thread as an active thread (as if runnable). */ 423 FILEDESC_UNLOCK(p1->p_fd); 487 ke2->ke_state = KES_THREAD; 424 fdtol = NULL; 488 ke2->ke_thread = td2; 425 } else { 489 td2->td_kse = ke2; 426 fd = fdshare(p1->p_fd); 490 427 if (p1->p_fdtol == NULL) 491 /* 428 p1->p_fdtol = 492 * Duplicate sub-structures as needed. 429 filedesc_to_leader_alloc(NULL, 493 * Increase reference counts on shared objects. 430 NULL, 494 * The p_stats substruct is set in vm_forkproc. 431 p1->p_leader); 495 */ 432 if ((flags & RFTHREAD) != 0) { 496 p2->p_flag = 0; 433 /* 497 if (p1->p_flag & P_PROFIL) 434 * Shared file descriptor table and 498 startprofclock(p2); 435 * shared process leaders. 499 mtx_lock_spin(&sched_lock); 436 */ 500 p2->p_sflag = PS_INMEM; 437 fdtol = p1->p_fdtol; 501 /* 438 FILEDESC_LOCK(p1->p_fd); 502 * Allow the scheduler to adjust the priority of the child and 439 fdtol->fdl_refcount++; 503 * parent while we hold the sched_lock. 440 FILEDESC_UNLOCK(p1->p_fd); 504 */ 441 } else { 505 sched_fork(p1, p2); 442 /* 506 443 * Shared file descriptor table, and 507 mtx_unlock_spin(&sched_lock); 444 * different process leaders 508 p2->p_ucred = crhold(td->td_ucred); 445 */ 509 td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */ 446 fdtol = filedesc_to_leader_alloc(p1->p_fdtol, 510 447 p1->p_fd, 511 pargs_hold(p2->p_args); 448 p2); 512 449 } 513 if (flags & RFSIGSHARE) { 450 } 514 p2->p_sigacts = sigacts_hold(p1->p_sigacts); 451 /* 515 } else { 452 * Make a proc table entry for the new process. 516 sigacts_copy(newsigacts, p1->p_sigacts); 10/29/03 07:23:09 sys/kern/kern_fork.c 5 517 p2->p_sigacts = newsigacts; 582 */ 518 } 583 p2->p_flag |= p1->p_flag & (P_ALTSTACK | P_SUGID); 519 if (flags & RFLINUXTHPN) 584 SESS_LOCK(p1->p_session); 520 p2->p_sigparent = SIGUSR1; 585 if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) 521 else 586 p2->p_flag |= P_CONTROLT; 522 p2->p_sigparent = SIGCHLD; 587 SESS_UNLOCK(p1->p_session); 523 588 if (flags & RFPPWAIT) 524 /* Bump references to the text vnode (for procfs) */ 589 p2->p_flag |= P_PPWAIT; 525 p2->p_textvp = p1->p_textvp; 590 526 if (p2->p_textvp) 591 LIST_INSERT_AFTER(p1, p2, p_pglist); 527 VREF(p2->p_textvp); 592 PGRP_UNLOCK(p1->p_pgrp); 528 p2->p_fd = fd; 593 LIST_INIT(&p2->p_children); 529 p2->p_fdtol = fdtol; 594 530 PROC_UNLOCK(p1); 595 callout_init(&p2->p_itcallout, CALLOUT_MPSAFE); 531 PROC_UNLOCK(p2); 596 532 597 #ifdef KTRACE 533 /* 598 /* 534 * p_limit is copy-on-write, bump refcnt, 599 * Copy traceflag and tracefile if enabled. 
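The RANGEOF macro above zeroes and copies whole sections of struct proc, struct thread, struct kse and struct ksegrp by bracketing them with marker fields (p_startzero/p_endzero, p_startcopy/p_endcopy). A self-contained illustration of the same offsetof() trick on a made-up struct (the struct and field names here are invented for the example):

#include <stddef.h>
#include <stdio.h>
#include <strings.h>

struct demo {
	int	d_id;			/* not touched by either section */
	/* zeroed section */
	int	d_startzero;
	long	d_count;
	void	*d_ptr;
	int	d_endzero;
	/* copied section */
	int	d_startcopy;
	int	d_flags;
	char	d_name[16];
	int	d_endcopy;
};

#define	RANGEOF(type, start, end)	(offsetof(type, end) - offsetof(type, start))

int
main(void)
{
	struct demo parent = { 1, 0, 42, NULL, 0, 0, 0x5, "parent", 0 };
	struct demo child = { 2 };

	bzero(&child.d_startzero,
	    (unsigned)RANGEOF(struct demo, d_startzero, d_endzero));
	bcopy(&parent.d_startcopy, &child.d_startcopy,
	    (unsigned)RANGEOF(struct demo, d_startcopy, d_endcopy));
	printf("child %d: flags %#x name %s\n",
	    child.d_id, child.d_flags, child.d_name);
	return (0);
}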
535 */ 600 */ 536 p2->p_limit = p1->p_limit; 601 mtx_lock(&ktrace_mtx); 537 p2->p_limit->p_refcnt++; 602 KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); 538 603 if (p1->p_traceflag & KTRFAC_INHERIT) { 539 /* 604 p2->p_traceflag = p1->p_traceflag; 540 * Setup linkage for kernel based threading 605 if ((p2->p_tracevp = p1->p_tracevp) != NULL) { 541 */ 606 VREF(p2->p_tracevp); 542 if((flags & RFTHREAD) != 0) { 607 KASSERT(p1->p_tracecred != NULL, 543 mtx_lock(&ppeers_lock); 608 ("ktrace vnode with no cred")); 544 p2->p_peers = p1->p_peers; 609 p2->p_tracecred = crhold(p1->p_tracecred); 545 p1->p_peers = p2; 610 } 546 p2->p_leader = p1->p_leader; 611 } 547 mtx_unlock(&ppeers_lock); 612 mtx_unlock(&ktrace_mtx); 548 PROC_LOCK(p1->p_leader); 613 #endif 549 if ((p1->p_leader->p_flag & P_WEXIT) != 0) { 614 550 PROC_UNLOCK(p1->p_leader); 615 /* 551 /* 616 * If PF_FORK is set, the child process inherits the 552 * The task leader is exiting, so process p1 is 617 * procfs ioctl flags from its parent. 553 * going to be killed shortly. Since p1 obviously 618 */ 554 * isn’t dead yet, we know that the leader is either 619 if (p1->p_pfsflags & PF_FORK) { 555 * sending SIGKILL’s to all the processes in this 620 p2->p_stops = p1->p_stops; 556 * task or is sleeping waiting for all the peers to 621 p2->p_pfsflags = p1->p_pfsflags; 557 * exit. We let p1 complete the fork, but we need 622 } 558 * to go ahead and kill the new process p2 since 623 559 * the task leader may not get a chance to send 624 /* 560 * SIGKILL to it. We leave it on the list so that 625 * This begins the section where we must prevent the parent 561 * the task leader will wait for this new process 626 * from being swapped. 562 * to commit suicide. 627 */ 563 */ 628 _PHOLD(p1); 564 PROC_LOCK(p2); 629 PROC_UNLOCK(p1); 565 psignal(p2, SIGKILL); 630 566 PROC_UNLOCK(p2); 631 /* 567 } else 632 * Attach the new process to its parent. 568 PROC_UNLOCK(p1->p_leader); 633 * 569 } else { 634 * If RFNOWAIT is set, the newly created process becomes a child 570 p2->p_peers = NULL; 635 * of init. This effectively disassociates the child from the 571 p2->p_leader = p2; 636 * parent. 572 } 637 */ 573 638 if (flags & RFNOWAIT) 574 sx_xlock(&proctree_lock); 639 pptr = initproc; 575 PGRP_LOCK(p1->p_pgrp); 640 else 576 PROC_LOCK(p2); 641 pptr = p1; 577 PROC_LOCK(p1); 642 p2->p_pptr = pptr; 578 643 LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); 579 /* 644 sx_xunlock(&proctree_lock); 580 * Preserve some more flags in subprocess. P_PROFIL has already 645 581 * been preserved. 646 /* Inform accounting that we have forked. */ 10/29/03 07:23:09 sys/kern/kern_fork.c 6 647 p2->p_acflag = AFORK; 712 PROC_LOCK(p2); 648 PROC_UNLOCK(p2); 713 while (p2->p_flag & P_PPWAIT) 649 714 msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); 650 /* 715 PROC_UNLOCK(p2); 651 * Finish creating the child process. It will return via a different 716 652 * execution path later. (ie: directly into user mode) 717 /* 653 */ 718 * If other threads are waiting, let them continue now 654 vm_forkproc(td, p2, td2, flags); 719 */ 655 720 if (p1->p_flag & P_SA) { 656 if (flags == (RFFDG | RFPROC)) { 721 PROC_LOCK(p1); 657 cnt.v_forks++; 722 thread_single_end(); 658 cnt.v_forkpages += p2->p_vmspace->vm_dsize + 723 PROC_UNLOCK(p1); 659 p2->p_vmspace->vm_ssize; 724 } 660 } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { 725 661 cnt.v_vforks++; 726 /* 662 cnt.v_vforkpages += p2->p_vmspace->vm_dsize + 727 * Return child proc pointer to parent. 
663 p2->p_vmspace->vm_ssize; 728 */ 664 } else if (p1 == &proc0) { 729 mtx_unlock(&Giant); 665 cnt.v_kthreads++; 730 *procp = p2; 666 cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + 731 return (0); 667 p2->p_vmspace->vm_ssize; 732 fail: 668 } else { 733 if (ppsratecheck(&lastfail, &curfail, 1)) 669 cnt.v_rforks++; 734 printf("maxproc limit exceeded by uid %i, please see tuning(7) 670 cnt.v_rforkpages += p2->p_vmspace->vm_dsize + and login.conf(5).\n", 671 p2->p_vmspace->vm_ssize; 735 uid); 672 } 736 sx_xunlock(&allproc_lock); 673 737 uma_zfree(proc_zone, newproc); 674 /* 738 if (p1->p_flag & P_SA) { 675 * Both processes are set up, now check if any loadable modules want 739 PROC_LOCK(p1); 676 * to adjust anything. 740 thread_single_end(); 677 * What if they have an error? XXX 741 PROC_UNLOCK(p1); 678 */ 742 } 679 EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); 743 tsleep(&forksleep, PUSER, "fork", hz / 2); 680 744 mtx_unlock(&Giant); 681 /* 745 return (error); 682 * If RFSTOPPED not requested, make child runnable and add to 746 } 683 * run queue. 747 684 */ 748 /* 685 microuptime(&p2->p_stats->p_start); 749 * Handle the return of a child process from fork1(). This function 686 if ((flags & RFSTOPPED) == 0) { 750 * is called from the MD fork_trampoline() entry point. 687 mtx_lock_spin(&sched_lock); 751 */ 688 p2->p_state = PRS_NORMAL; 752 void 689 TD_SET_CAN_RUN(td2); 753 fork_exit(callout, arg, frame) 690 setrunqueue(td2); 754 void (*callout)(void *, struct trapframe *); 691 mtx_unlock_spin(&sched_lock); 755 void *arg; 692 } 756 struct trapframe *frame; 693 757 { 694 /* 758 struct proc *p; 695 * Now can be swapped. 759 struct thread *td; 696 */ 760 697 PROC_LOCK(p1); 761 /* 698 _PRELE(p1); 762 * Processes normally resume in mi_switch() after being 699 763 * cpu_switch()’ed to, but when children start up they arrive here 700 /* 764 * instead, so we must do much the same things as mi_switch() would. 701 * Tell any interested parties about the new process. 765 */ 702 */ 766 703 KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); 767 if ((td = PCPU_GET(deadthread))) { 704 768 PCPU_SET(deadthread, NULL); 705 PROC_UNLOCK(p1); 769 thread_stash(td); 706 770 } 707 /* 771 td = curthread; 708 * Preserve synchronization semantics of vfork. If waiting for 772 p = td->td_proc; 709 * child to exec or exit, set P_PPWAIT on child, and sleep on our 773 td->td_oncpu = PCPU_GET(cpuid); 710 * proc (in case of exit). 774 p->p_state = PRS_NORMAL; 711 */ 775 10/29/03 07:23:09 sys/kern/kern_fork.c 7 776 /* 777 * Finish setting up thread glue so that it begins execution in a 778 * non-nested critical section with sched_lock held but not recursed. 779 */ 780 sched_lock.mtx_lock = (uintptr_t)td; 781 mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); 782 cpu_critical_fork_exit(); 783 CTR3(KTR_PROC, "fork_exit: new thread %p (pid %d, %s)", td, p->p_pid, 784 p->p_comm); 785 mtx_unlock_spin(&sched_lock); 786 787 /* 788 * cpu_set_fork_handler intercepts this function call to 789 * have this call a non-return function to stay in kernel mode. 790 * initproc has its own fork handler, but it does return. 791 */ 792 KASSERT(callout != NULL, ("NULL callout in fork_exit")); 793 callout(arg, frame); 794 795 /* 796 * Check if a kernel thread misbehaved and returned from its main 797 * function. 
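The "Preserve synchronization semantics of vfork" comment above is visible from userland: a vforked parent stays blocked in the P_PPWAIT msleep loop until the child execs or exits. A small demonstration of that ordering, using only standard calls (illustrative, not part of the listing):

#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;

	pid = vfork();
	if (pid == 0)
		_exit(0);	/* until here the parent sleeps on "ppwait" */

	/* Reached only after the child has exited (or would have exec'd). */
	printf("child %d released the parent\n", (int)pid);
	waitpid(pid, NULL, 0);
	return (0);
}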
798 */ 799 PROC_LOCK(p); 800 if (p->p_flag & P_KTHREAD) { 801 PROC_UNLOCK(p); 802 mtx_lock(&Giant); 803 printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", 804 p->p_comm, p->p_pid); 805 kthread_exit(0); 806 } 807 PROC_UNLOCK(p); 808 #ifdef DIAGNOSTIC 809 cred_free_thread(td); 810 #endif 811 mtx_assert(&Giant, MA_NOTOWNED); 812 } 813 814 /* 815 * Simplified back end of syscall(), used when returning from fork() 816 * directly into user mode. Giant is not held on entry, and must not 817 * be held on return. This function is passed in to fork_exit() as the 818 * first parameter and is called when returning to a new userland process. 819 */ 820 void 821 fork_return(td, frame) 822 struct thread *td; 823 struct trapframe *frame; 824 { 825 826 userret(td, frame, 0); 827 #ifdef KTRACE 828 if (KTRPOINT(td, KTR_SYSRET)) 829 ktrsysret(SYS_fork, 0, 0); 830 #endif 831 mtx_assert(&Giant, MA_NOTOWNED); 832 } 10/16/03 01:39:15 sys/kern/kern_proc.c 1 1 /* 524 /* 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 525 * Check p’s parent to see whether p qualifies its own process 3 * The Regents of the University of California. All rights reserved. 526 * group; if so, adjust count for p’s process group. 4 * 527 */ 5 * Redistribution and use in source and binary forms, with or without 528 mysession = pgrp->pg_session; 6 * modification, are permitted provided that the following conditions 529 if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && 7 * are met: 530 hispgrp->pg_session == mysession) 8 * 1. Redistributions of source code must retain the above copyright 531 pgadjustjobc(pgrp, entering); 9 * notice, this list of conditions and the following disclaimer. 532 10 * 2. Redistributions in binary form must reproduce the above copyright 533 /* 11 * notice, this list of conditions and the following disclaimer in the 534 * Check this process’ children to see whether they qualify 12 * documentation and/or other materials provided with the distribution. 535 * their process groups; if so, adjust counts for children’s 13 * 3. All advertising materials mentioning features or use of this software 536 * process groups. 14 * must display the following acknowledgement: 537 */ 15 * This product includes software developed by the University of 538 LIST_FOREACH(p, &p->p_children, p_sibling) { 16 * California, Berkeley and its contributors. 539 hispgrp = p->p_pgrp; 17 * 4. Neither the name of the University nor the names of its contributors 540 if (hispgrp == pgrp || 18 * may be used to endorse or promote products derived from this software 541 hispgrp->pg_session != mysession) 19 * without specific prior written permission. 542 continue; 20 * 543 PROC_LOCK(p); 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 544 if (p->p_state == PRS_ZOMBIE) { 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 545 PROC_UNLOCK(p); 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 546 continue; 24 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 547 } 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 548 PROC_UNLOCK(p); 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 549 pgadjustjobc(hispgrp, entering); 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 550 } 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 551 } 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 34 * $FreeBSD: src/sys/kern/kern_proc.c,v 1.197 2003/10/16 08:39:15 jeff Exp $ 35 */ 36 37 #include 38 __FBSDID("$FreeBSD: src/sys/kern/kern_proc.c,v 1.197 2003/10/16 08:39:15 jeff Exp $");

500 /* 501 * Adjust pgrp jobc counters when specified process changes process group. 502 * We count the number of processes in each process group that "qualify" 503 * the group for terminal job control (those with a parent in a different 504 * process group of the same session). If that count reaches zero, the 505 * process group becomes orphaned. Check both the specified process’ 506 * process group and that of its children. 507 * entering == 0 => p is leaving specified group. 508 * entering == 1 => p is entering specified group. 509 */ 510 void 511 fixjobc(p, pgrp, entering) 512 register struct proc *p; 513 register struct pgrp *pgrp; 514 int entering; 515 { 516 register struct pgrp *hispgrp; 517 register struct session *mysession; 518 519 sx_assert(&proctree_lock, SX_LOCKED); 520 PROC_LOCK_ASSERT(p, MA_NOTOWNED); 521 PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); 522 SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); 523 10/26/03 23:15:47 sys/kern/kern_resource.c 1 1 /*- 887 * There’s a chance someone created our uidinfo while we 2 * Copyright (c) 1982, 1986, 1991, 1993 888 * were in malloc and not holding the lock, so we have to 3 * The Regents of the University of California. All rights reserved. 889 * make sure we don’t insert a duplicate uidinfo 4 * (c) UNIX System Laboratories, Inc. 890 */ 5 * All or some portions of this file are derived from material licensed 891 if ((old_uip = uilookup(uid)) != NULL) { 6 * to the University of California by American Telephone and Telegraph 892 /* someone else beat us to it */ 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 893 free(uip, M_UIDINFO); 8 * the permission of UNIX System Laboratories, Inc. 894 uip = old_uip; 9 * 895 } else { 10 * Redistribution and use in source and binary forms, with or without 896 uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep); 11 * modification, are permitted provided that the following conditions 897 uip->ui_uid = uid; 12 * are met: 898 LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash); 13 * 1. Redistributions of source code must retain the above copyright 899 } 14 * notice, this list of conditions and the following disclaimer. 900 } 15 * 2. Redistributions in binary form must reproduce the above copyright 901 uihold(uip); 16 * notice, this list of conditions and the following disclaimer in the 902 mtx_unlock(&uihashtbl_mtx); 17 * documentation and/or other materials provided with the distribution. 903 return (uip); 18 * 3. All advertising materials mentioning features or use of this software 904 } 19 * must display the following acknowledgement: 905 20 * This product includes software developed by the University of 906 /* 21 * California, Berkeley and its contributors. 907 * Place another refcount on a uidinfo struct. 22 * 4. Neither the name of the University nor the names of its contributors 908 */ 23 * may be used to endorse or promote products derived from this software 909 void 24 * without specific prior written permission. 910 uihold(uip) 25 * 911 struct uidinfo *uip; 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 912 { 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 913 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 914 UIDINFO_LOCK(uip); 29 * ARE DISCLAIMED. 
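The rule fixjobc() encodes above is compact enough to state as a predicate: a process "qualifies" its process group for terminal job control when its parent sits in a different process group of the same session. A standalone sketch with minimal stand-in types (the struct names are invented for the example and only echo the kernel's field names):

#include <stdbool.h>
#include <stdio.h>

struct sess	{ int s_sid; };
struct pg	{ int pg_id; struct sess *pg_session; };
struct procx	{ struct pg *p_pgrp; struct procx *p_pptr; };

/* Parent in another pgrp of the same session => qualifies the group. */
static bool
qualifies(const struct procx *p)
{
	const struct pg *mine = p->p_pgrp;
	const struct pg *parents = p->p_pptr->p_pgrp;

	return (parents != mine && parents->pg_session == mine->pg_session);
}

int
main(void)
{
	struct sess s = { 100 };
	struct pg shell = { 100, &s }, job = { 123, &s };
	struct procx login = { &shell, NULL };
	struct procx cmd = { &job, &login };

	printf("cmd qualifies its pgrp: %s\n", qualifies(&cmd) ? "yes" : "no");
	return (0);
}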
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 915 uip->ui_ref++; 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 916 UIDINFO_UNLOCK(uip); 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 917 } 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 918 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 919 /*- 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 920 * Since uidinfo structs have a long lifetime, we use an 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 921 * opportunistic refcounting scheme to avoid locking the lookup hash 36 * SUCH DAMAGE. 922 * for each release. 37 * 923 * 38 * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 924 * If the refcount hits 0, we need to free the structure, 39 */ 925 * which means we need to lock the hash. 40 926 * Optimal case: 41 #include 927 * After locking the struct and lowering the refcount, if we find 42 __FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.128 2003/10/27 07:15:47 j 928 * that we don’t need to free, simply unlock and return. eff Exp $"); 929 * Suboptimal case: 930 * If refcount lowering results in need to free, bump the count 931 * back up, loose the lock and aquire the locks in the proper 867 /* 932 * order to try again. 868 * Find or allocate a struct uidinfo for a particular uid. 933 */ 869 * Increase refcount on uidinfo struct returned. 934 void 870 * uifree() should be called on a struct uidinfo when released. 935 uifree(uip) 871 */ 936 struct uidinfo *uip; 872 struct uidinfo * 937 { 873 uifind(uid) 938 874 uid_t uid; 939 /* Prepare for optimal case. */ 875 { 940 UIDINFO_LOCK(uip); 876 struct uidinfo *uip; 941 877 942 if (--uip->ui_ref != 0) { 878 mtx_lock(&uihashtbl_mtx); 943 UIDINFO_UNLOCK(uip); 879 uip = uilookup(uid); 944 return; 880 if (uip == NULL) { 945 } 881 struct uidinfo *old_uip; 946 882 947 /* Prepare for suboptimal case. */ 883 mtx_unlock(&uihashtbl_mtx); 948 uip->ui_ref++; 884 uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO); 949 UIDINFO_UNLOCK(uip); 885 mtx_lock(&uihashtbl_mtx); 950 mtx_lock(&uihashtbl_mtx); 886 /* 951 UIDINFO_LOCK(uip); 10/26/03 23:15:47 sys/kern/kern_resource.c 2 952 953 /* 954 * We must subtract one from the count again because we backed out 955 * our initial subtraction before dropping the lock. 956 * Since another thread may have added a reference after we dropped th e 957 * initial lock we have to test for zero again. 958 */ 959 if (--uip->ui_ref == 0) { 960 LIST_REMOVE(uip, ui_hash); 961 mtx_unlock(&uihashtbl_mtx); 962 if (uip->ui_sbsize != 0) 963 /* XXX no %qd in kernel. Truncate. */ 964 printf("freeing uidinfo: uid = %d, sbsize = %ld\n", 965 uip->ui_uid, (long)uip->ui_sbsize); 966 if (uip->ui_proccnt != 0) 967 printf("freeing uidinfo: uid = %d, proccnt = %ld\n", 968 uip->ui_uid, uip->ui_proccnt); 969 UIDINFO_UNLOCK(uip); 970 FREE(uip, M_UIDINFO); 971 return; 972 } 973 974 mtx_unlock(&uihashtbl_mtx); 975 UIDINFO_UNLOCK(uip); 976 } 977 978 /* 979 * Change the count associated with number of processes 980 * a given user is using. 
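uifree() above avoids taking the hash lock on every release: it drops the reference under the per-uidinfo lock, and only when the count would hit zero does it back the decrement out, acquire the hash lock and the object lock in the proper order, and retest. A userland sketch of the same opportunistic pattern with pthread mutexes (names invented for the example):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

struct obj {
	pthread_mutex_t	o_lock;
	int		o_ref;
	/* ...would also be linked into the hash protected by hash_lock... */
};

static void
obj_release(struct obj *o)
{
	/* Optimal case: drop the reference without touching the hash lock. */
	pthread_mutex_lock(&o->o_lock);
	if (--o->o_ref != 0) {
		pthread_mutex_unlock(&o->o_lock);
		return;
	}

	/* Suboptimal case: back out, then take the locks in the right order. */
	o->o_ref++;
	pthread_mutex_unlock(&o->o_lock);
	pthread_mutex_lock(&hash_lock);
	pthread_mutex_lock(&o->o_lock);

	/* Someone may have gained a reference while we held no locks: retest. */
	if (--o->o_ref == 0) {
		/* would remove the object from the hash here */
		pthread_mutex_unlock(&o->o_lock);
		pthread_mutex_unlock(&hash_lock);
		pthread_mutex_destroy(&o->o_lock);
		free(o);
		return;
	}
	pthread_mutex_unlock(&o->o_lock);
	pthread_mutex_unlock(&hash_lock);
}

int
main(void)
{
	struct obj *o = malloc(sizeof(*o));

	pthread_mutex_init(&o->o_lock, NULL);
	o->o_ref = 2;
	obj_release(o);		/* fast path: 2 -> 1 */
	obj_release(o);		/* slow path: frees the object */
	return (0);
}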
When ’max’ is 0, don’t enforce a limit 981 */ 982 int 983 chgproccnt(uip, diff, max) 984 struct uidinfo *uip; 985 int diff; 986 int max; 987 { 988 989 UIDINFO_LOCK(uip); 990 /* don’t allow them to exceed max, but allow subtraction */ 991 if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) { 992 UIDINFO_UNLOCK(uip); 993 return (0); 994 } 995 uip->ui_proccnt += diff; 996 if (uip->ui_proccnt < 0) 997 printf("negative proccnt for uid = %d\n", uip->ui_uid); 998 UIDINFO_UNLOCK(uip); 999 return (1); 1000 } 10/29/03 18:55:43 sys/kern/kern_sig.c 1 1 /* 599 SIGDELSET(ps->ps_sigcatch, sig); 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 600 if (sigprop(sig) & SA_IGNORE) { 3 * The Regents of the University of California. All rights reserved. 601 if (sig != SIGCONT) 4 * (c) UNIX System Laboratories, Inc. 602 SIGADDSET(ps->ps_sigignore, sig); 5 * All or some portions of this file are derived from material licensed 603 SIGDELSET(p->p_siglist, sig); 6 * to the University of California by American Telephone and Telegraph 604 /* 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 605 * There is only one thread at this point. 8 * the permission of UNIX System Laboratories, Inc. 606 */ 9 * 607 SIGDELSET(FIRST_THREAD_IN_PROC(p)->td_siglist, sig); 10 * Redistribution and use in source and binary forms, with or without 608 } 11 * modification, are permitted provided that the following conditions 609 ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; 12 * are met: 610 } 13 * 1. Redistributions of source code must retain the above copyright 611 /* 14 * notice, this list of conditions and the following disclaimer. 612 * Reset stack state to the user stack. 15 * 2. Redistributions in binary form must reproduce the above copyright 613 * Clear set of signals caught on the signal stack. 16 * notice, this list of conditions and the following disclaimer in the 614 */ 17 * documentation and/or other materials provided with the distribution. 615 p->p_sigstk.ss_flags = SS_DISABLE; 18 * 3. All advertising materials mentioning features or use of this software 616 p->p_sigstk.ss_size = 0; 19 * must display the following acknowledgement: 617 p->p_sigstk.ss_sp = 0; 20 * This product includes software developed by the University of 618 p->p_flag &= ˜P_ALTSTACK; 21 * California, Berkeley and its contributors. 619 /* 22 * 4. Neither the name of the University nor the names of its contributors 620 * Reset no zombies if child dies flag as Solaris does. 23 * may be used to endorse or promote products derived from this software 621 */ 24 * without specific prior written permission. 622 ps->ps_flag &= ˜(PS_NOCLDWAIT | PS_CLDSIGIGN); 25 * 623 if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 624 ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 625 mtx_unlock(&ps->ps_mtx); 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 626 } 29 * ARE DISCLAIMED. 
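chgproccnt() above bumps a per-uid process count but refuses an increment that would push the count past a non-zero limit, while always allowing decrements so an over-limit user can still exit. The same policy as a tiny standalone helper (single-threaded, so the UIDINFO lock is omitted; illustrative only):

#include <stdio.h>

/* Return 1 on success, 0 if the increment would exceed a non-zero max. */
static int
chgcount(long *cnt, int diff, long max)
{
	if (diff > 0 && *cnt + diff > max && max != 0)
		return (0);
	*cnt += diff;
	if (*cnt < 0)
		printf("negative count\n");
	return (1);
}

int
main(void)
{
	long proccnt = 99;

	printf("+1 under limit 100: %d\n", chgcount(&proccnt, 1, 100));	/* allowed */
	printf("+1 at limit 100:    %d\n", chgcount(&proccnt, 1, 100));	/* refused */
	printf("-1 always allowed:  %d\n", chgcount(&proccnt, -1, 100));
	return (0);
}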
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 36 * SUCH DAMAGE.
 37 *
 38 * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
 39 */
 40
 41 #include
 42 __FBSDID("$FreeBSD: src/sys/kern/kern_sig.c,v 1.262 2003/10/30 02:55:43 davidxu Exp $");

579 /* 580 * Reset signals for an exec of the specified process. 581 */ 582 void 583 execsigs(p) 584 register struct proc *p; 585 { 586 register struct sigacts *ps; 587 register int sig; 588 589 /* 590 * Reset caught signals. Held signals remain held 591 * through td_sigmask (unless they were caught, 592 * and are now ignored by default). 593 */ 594 PROC_LOCK_ASSERT(p, MA_OWNED); 595 ps = p->p_sigacts; 596 mtx_lock(&ps->ps_mtx); 597 while (SIGNOTEMPTY(ps->ps_sigcatch)) { 598 sig = sig_ffs(&ps->ps_sigcatch); 10/02/03 08:00:55 sys/kern/kern_subr.c 1 1 /* 152 2 * Copyright (c) 1982, 1986, 1991, 1993 153 while (n > 0 && uio->uio_resid) { 3 * The Regents of the University of California. All rights reserved. 154 iov = uio->uio_iov; 4 * (c) UNIX System Laboratories, Inc. 155 cnt = iov->iov_len; 5 * All or some portions of this file are derived from material licensed 156 if (cnt == 0) { 6 * to the University of California by American Telephone and Telegraph 157 uio->uio_iov++; 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 158 uio->uio_iovcnt--; 8 * the permission of UNIX System Laboratories, Inc. 159 continue; 9 * 160 } 10 * Redistribution and use in source and binary forms, with or without 161 if (cnt > n) 11 * modification, are permitted provided that the following conditions 162 cnt = n; 12 * are met: 163 13 * 1. Redistributions of source code must retain the above copyright 164 switch (uio->uio_segflg) { 14 * notice, this list of conditions and the following disclaimer. 165 15 * 2. Redistributions in binary form must reproduce the above copyright 166 case UIO_USERSPACE: 16 * notice, this list of conditions and the following disclaimer in the 167 if (ticks - PCPU_GET(switchticks) >= hogticks) 17 * documentation and/or other materials provided with the distribution. 168 uio_yield(); 18 * 3. All advertising materials mentioning features or use of this software 169 if (uio->uio_rw == UIO_READ) 19 * must display the following acknowledgement: 170 error = copyout(cp, iov->iov_base, cnt); 20 * This product includes software developed by the University of 171 else 21 * California, Berkeley and its contributors. 172 error = copyin(iov->iov_base, cp, cnt); 22 * 4. Neither the name of the University nor the names of its contributors 173 if (error) 23 * may be used to endorse or promote products derived from this software 174 goto out; 24 * without specific prior written permission. 175 break; 25 * 176 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 177 case UIO_SYSSPACE: 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 178 if (uio->uio_rw == UIO_READ) 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 179 bcopy(cp, iov->iov_base, cnt); 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 180 else 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 181 bcopy(iov->iov_base, cp, cnt); 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 182 break; 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 183 case UIO_NOCOPY: 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 184 break; 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 185 } 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 186 iov->iov_base = (char *)iov->iov_base + cnt; 36 * SUCH DAMAGE. 
187 iov->iov_len -= cnt; 37 * 188 uio->uio_resid -= cnt; 38 * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 189 uio->uio_offset += cnt; 39 */ 190 cp = (char *)cp + cnt; 40 191 n -= cnt; 41 #include 192 } 42 __FBSDID("$FreeBSD: src/sys/kern/kern_subr.c,v 1.77 2003/10/02 15:00:55 nectar 193 out: Exp $"); 194 if (td && save == 0) { 195 mtx_lock_spin(&sched_lock); 196 td->td_flags &= ˜TDF_DEADLKTREAT; 132 int 197 mtx_unlock_spin(&sched_lock); 133 uiomove(void *cp, int n, struct uio *uio) 198 } 134 { 199 return (error); 135 struct thread *td = curthread; 200 } 136 struct iovec *iov; 137 u_int cnt; 138 int error = 0; 139 int save = 0; 140 141 KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, 142 ("uiomove: mode")); 143 KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, 144 ("uiomove proc")); 145 146 if (td) { 147 mtx_lock_spin(&sched_lock); 148 save = td->td_flags & TDF_DEADLKTREAT; 149 td->td_flags |= TDF_DEADLKTREAT; 150 mtx_unlock_spin(&sched_lock); 151 } 11/11/03 14:07:29 sys/kern/kern_thread.c 1 1 /* 1242 } 2 * Copyright (C) 2001 Julian Elischer . 1243 3 * All rights reserved. 1244 cpu_thread_exit(td); /* XXXSMP */ 4 * 1245 5 * Redistribution and use in source and binary forms, with or without 1246 /* 6 * modification, are permitted provided that the following conditions 1247 * The last thread is left attached to the process 7 * are met: 1248 * So that the whole bundle gets recycled. Skip 8 * 1. Redistributions of source code must retain the above copyright 1249 * all this stuff. 9 * notice(s), this list of conditions and the following disclaimer as 1250 */ 10 * the first lines of this file unmodified other than the possible 1251 if (p->p_numthreads > 1) { 11 * addition of one or more copyright notices. 1252 thread_unlink(td); 12 * 2. Redistributions in binary form must reproduce the above copyright 1253 if (p->p_maxthrwaits) 13 * notice(s), this list of conditions and the following disclaimer in the 1254 wakeup(&p->p_numthreads); 14 * documentation and/or other materials provided with the distribution. 1255 /* 15 * 1256 * The test below is NOT true if we are the 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ‘‘AS IS’’ AND ANY 1257 * sole exiting thread. P_STOPPED_SNGL is unset 17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 1258 * in exit1() after it is the only survivor. 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 1259 */ 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY 1260 if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { 20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 1261 if (p->p_numthreads == p->p_suspcount) { 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 1262 thread_unsuspend_one(p->p_singlethread); 22 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 1263 } 23 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 1264 } 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 1265 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 1266 /* 26 * DAMAGE. 1267 * Because each upcall structure has an owner thread, 27 */ 1268 * owner thread exits only when process is in exiting 28 1269 * state, so upcall to userland is no longer needed, 29 #include 1270 * deleting upcall structure is safe here. 
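uiomove() above walks a struct uio, consuming iovecs as they empty and switching between copyin/copyout for user space and bcopy for kernel space. A simplified userland analogue that gathers from an iovec array into a flat buffer with memcpy (no UIO_USERSPACE distinction and no deadlock-treatment flags; illustrative only):

#include <sys/uio.h>
#include <stdio.h>
#include <string.h>

/* Copy up to n bytes out of the iovec array into buf, advancing the iovecs. */
static size_t
gather(void *buf, size_t n, struct iovec **iovp, int *iovcntp)
{
	char *cp = buf;
	size_t cnt, done = 0;

	while (n > 0 && *iovcntp > 0) {
		struct iovec *iov = *iovp;

		cnt = iov->iov_len;
		if (cnt == 0) {			/* exhausted: move to next iovec */
			(*iovp)++;
			(*iovcntp)--;
			continue;
		}
		if (cnt > n)
			cnt = n;
		memcpy(cp, iov->iov_base, cnt);
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		cp += cnt;
		n -= cnt;
		done += cnt;
	}
	return (done);
}

int
main(void)
{
	char a[] = "scatter/", b[] = "gather", out[32] = "";
	struct iovec vec[2] = {
		{ a, sizeof(a) - 1 },
		{ b, sizeof(b) - 1 },
	};
	struct iovec *iov = vec;
	int iovcnt = 2;

	printf("%zu bytes: %s\n",
	    gather(out, sizeof(out) - 1, &iov, &iovcnt), out);
	return (0);
}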
30 __FBSDID("$FreeBSD: src/sys/kern/kern_thread.c,v 1.162 2003/11/11 22:07:29 jhb 1271 * So when all threads in a group is exited, all upcalls Exp $"); 1272 * in the group should be automatically freed. 1273 */ 1274 if (td->td_upcall) 1210 /* 1275 upcall_remove(td); 1211 * Discard the current thread and exit from its context. 1276 1212 * 1277 sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); 1213 * Because we can’t free a thread while we’re operating under its context, 1278 sched_exit_kse(FIRST_KSE_IN_PROC(p), ke); 1214 * push the current thread into our CPU’s deadthread holder. This means 1279 ke->ke_state = KES_UNQUEUED; 1215 * we needn’t worry about someone else grabbing our context before we 1280 ke->ke_thread = NULL; 1216 * do a cpu_throw(). 1281 /* 1217 */ 1282 * Decide what to do with the KSE attached to this thread. 1218 void 1283 */ 1219 thread_exit(void) 1284 if (ke->ke_flags & KEF_EXIT) { 1220 { 1285 kse_unlink(ke); 1221 struct thread *td; 1286 if (kg->kg_kses == 0) { 1222 struct kse *ke; 1287 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), kg) 1223 struct proc *p; ; 1224 struct ksegrp *kg; 1288 ksegrp_unlink(kg); 1225 1289 } 1226 td = curthread; 1290 } 1227 kg = td->td_ksegrp; 1291 else 1228 p = td->td_proc; 1292 kse_reassign(ke); 1229 ke = td->td_kse; 1293 PROC_UNLOCK(p); 1230 1294 td->td_kse = NULL; 1231 mtx_assert(&sched_lock, MA_OWNED); 1295 td->td_state = TDS_INACTIVE; 1232 KASSERT(p != NULL, ("thread exiting without a process")); 1296 #if 0 1233 KASSERT(ke != NULL, ("thread exiting without a kse")); 1297 td->td_proc = NULL; 1234 KASSERT(kg != NULL, ("thread exiting without a kse group")); 1298 #endif 1235 PROC_LOCK_ASSERT(p, MA_OWNED); 1299 td->td_ksegrp = NULL; 1236 CTR1(KTR_PROC, "thread_exit: thread %p", td); 1300 td->td_last_kse = NULL; 1237 KASSERT(!mtx_owned(&Giant), ("dying thread owns giant")); 1301 PCPU_SET(deadthread, td); 1238 1302 } else { 1239 if (td->td_standin != NULL) { 1303 PROC_UNLOCK(p); 1240 thread_stash(td->td_standin); 1304 } 1241 td->td_standin = NULL; 1305 /* XXX Shouldn’t cpu_throw() here. */ 11/11/03 14:07:29 sys/kern/kern_thread.c 2 1306 mtx_assert(&sched_lock, MA_OWNED); 1307 cpu_throw(td, choosethread()); 1308 panic("I’m a teapot!"); 1309 /* NOTREACHED */ 1310 } 1311 1312 /* 1313 * Do any thread specific cleanups that may be needed in wait() 1314 * called with Giant held, proc and schedlock not held. 1315 */ 1316 void 1317 thread_wait(struct proc *p) 1318 { 1319 struct thread *td; 1320 1321 KASSERT((p->p_numthreads == 1), ("Muliple threads in wait1()")); 1322 KASSERT((p->p_numksegrps == 1), ("Muliple ksegrps in wait1()")); 1323 FOREACH_THREAD_IN_PROC(p, td) { 1324 if (td->td_standin != NULL) { 1325 thread_free(td->td_standin); 1326 td->td_standin = NULL; 1327 } 1328 cpu_thread_clean(td); 1329 } 1330 thread_reap(); /* check for zombie threads etc. */ 1331 } 11/09/03 01:17:24 sys/kern/sys_generic.c 1 1 /* 329 2 * Copyright (c) 1982, 1986, 1989, 1993 330 if ((error = fget_write(td, uap->fd, &fp)) == 0) { 3 * The Regents of the University of California. All rights reserved. 331 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, 4 * (c) UNIX System Laboratories, Inc. 332 (off_t)-1, 0); 5 * All or some portions of this file are derived from material licensed 333 fdrop(fp, td); 6 * to the University of California by American Telephone and Telegraph 334 } else { 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 335 error = EBADF; /* XXX this can’t be right */ 8 * the permission of UNIX System Laboratories, Inc. 
336 } 9 * 337 return(error); 10 * Redistribution and use in source and binary forms, with or without 338 } 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 377 static int 14 * notice, this list of conditions and the following disclaimer. 378 dofilewrite(td, fp, fd, buf, nbyte, offset, flags) 15 * 2. Redistributions in binary form must reproduce the above copyright 379 struct thread *td; 16 * notice, this list of conditions and the following disclaimer in the 380 struct file *fp; 17 * documentation and/or other materials provided with the distribution. 381 int fd, flags; 18 * 3. All advertising materials mentioning features or use of this software 382 const void *buf; 19 * must display the following acknowledgement: 383 size_t nbyte; 20 * This product includes software developed by the University of 384 off_t offset; 21 * California, Berkeley and its contributors. 385 { 22 * 4. Neither the name of the University nor the names of its contributors 386 struct uio auio; 23 * may be used to endorse or promote products derived from this software 387 struct iovec aiov; 24 * without specific prior written permission. 388 long cnt, error = 0; 25 * 389 #ifdef KTRACE 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 390 struct iovec ktriov; 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 391 struct uio ktruio; 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 392 int didktr = 0; 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 393 #endif 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 394 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 395 aiov.iov_base = (void *)(uintptr_t)buf; 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 396 aiov.iov_len = nbyte; 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 397 auio.uio_iov = &aiov; 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 398 auio.uio_iovcnt = 1; 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 399 auio.uio_offset = offset; 36 * SUCH DAMAGE. 400 if (nbyte > INT_MAX) 37 * 401 return (EINVAL); 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 402 auio.uio_resid = nbyte; 39 */ 403 auio.uio_rw = UIO_WRITE; 40 404 auio.uio_segflg = UIO_USERSPACE; 41 #include 405 auio.uio_td = td; 42 __FBSDID("$FreeBSD: src/sys/kern/sys_generic.c,v 1.126 2003/11/09 09:17:24 tan 406 #ifdef KTRACE imura Exp $"); 407 /* 408 * if tracing, save a copy of iovec and uio 409 */ 309 /* 410 if (KTRPOINT(td, KTR_GENIO)) { 310 * Write system call 411 ktriov = aiov; 311 */ 412 ktruio = auio; 312 #ifndef _SYS_SYSPROTO_H_ 413 didktr = 1; 313 struct write_args { 414 } 314 int fd; 415 #endif 315 const void *buf; 416 cnt = nbyte; 316 size_t nbyte; 417 if (fp->f_type == DTYPE_VNODE) 317 }; 418 bwillwrite(); 318 #endif 419 if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) { 319 /* 420 if (auio.uio_resid != cnt && (error == ERESTART || 320 * MPSAFE 421 error == EINTR || error == EWOULDBLOCK)) 321 */ 422 error = 0; 322 int 423 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 323 write(td, uap) 424 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { 324 struct thread *td; 425 PROC_LOCK(td->td_proc); 325 struct write_args *uap; 426 psignal(td->td_proc, SIGPIPE); 326 { 427 PROC_UNLOCK(td->td_proc); 327 struct file *fp; 428 } 328 int error; 429 } 11/09/03 01:17:24 sys/kern/sys_generic.c 2 430 cnt -= auio.uio_resid; 431 #ifdef KTRACE 432 if (didktr && error == 0) { 433 ktruio.uio_iov = &ktriov; 434 ktruio.uio_resid = cnt; 435 ktrgenio(fd, UIO_WRITE, &ktruio, error); 436 } 437 #endif 438 td->td_retval[0] = cnt; 439 return (error); 440 } 11/15/03 01:28:09 sys/kern/vfs_bio.c 1 1 /* 65 "buf_ops_bio", 2 * Copyright (c) 1994,1997 John S. Dyson 66 bwrite 3 * All rights reserved. 67 }; 4 * 68 5 * Redistribution and use in source and binary forms, with or without 69 /* 6 * modification, are permitted provided that the following conditions 70 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has 7 * are met: 71 * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. 8 * 1. Redistributions of source code must retain the above copyright 72 */ 9 * notice immediately at the beginning of the file, without modification, 73 struct buf *buf; /* buffer header pool */ 10 * this list of conditions, and the following disclaimer. 74 11 * 2. Absolutely no warranty of function or purpose is made by the author 75 static struct proc *bufdaemonproc; 12 * John S. Dyson. 76 13 */ 77 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, 14 78 vm_offset_t to); 15 /* 79 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, 16 * this file contains a new buffer I/O scheme implementing a coherent 80 vm_offset_t to); 17 * VM object and buffer cache scheme. Pains have been taken to make 81 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, 18 * sure that the performance degradation associated with schemes such 82 int pageno, vm_page_t m); 19 * as this is not realized. 83 static void vfs_clean_pages(struct buf * bp); 20 * 84 static void vfs_setdirty(struct buf *bp); 21 * Author: John S. Dyson 85 static void vfs_vmio_release(struct buf *bp); 22 * Significant help during the development and debugging phases 86 static void vfs_backgroundwritedone(struct buf *bp); 23 * had been provided by David Greenman, also of the FreeBSD core team. 87 static int vfs_bio_clcheck(struct vnode *vp, int size, 24 * 88 daddr_t lblkno, daddr_t blkno); 25 * see man buf(9) for more info. 
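dofilewrite() above reports how much of the request completed (cnt minus the residual), suppresses the error for a partial write interrupted by ERESTART/EINTR/EWOULDBLOCK, and raises SIGPIPE itself only for non-socket files. The matching discipline on the userland side is a loop that keeps writing the unwritten remainder; a sketch (SIGPIPE ignored so EPIPE shows up as an errno rather than a signal):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write all of buf, retrying after partial writes and EINTR. */
static ssize_t
write_all(int fd, const void *buf, size_t len)
{
	const char *p = buf;
	size_t left = len;
	ssize_t n;

	while (left > 0) {
		n = write(fd, p, left);
		if (n < 0) {
			if (errno == EINTR)
				continue;	/* retry the remainder */
			return (-1);		/* EPIPE, EBADF, ... */
		}
		p += n;
		left -= n;
	}
	return ((ssize_t)len);
}

int
main(void)
{
	const char msg[] = "hello, write(2)\n";

	signal(SIGPIPE, SIG_IGN);
	if (write_all(STDOUT_FILENO, msg, strlen(msg)) < 0)
		perror("write");
	return (0);
}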
89 static int flushbufqueues(int flushdeps); 26 */ 90 static void buf_daemon(void); 27 91 void bremfreel(struct buf * bp); 28 #include 92 29 __FBSDID("$FreeBSD: src/sys/kern/vfs_bio.c,v 1.425 2003/11/15 09:28:09 phk Exp 93 int vmiodirenable = TRUE; $"); 94 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, 30 95 "Use the VM system for directory writes"); 31 #include 96 int runningbufspace; 32 #include 97 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, 33 #include 98 "Amount of presently outstanding async buffer io"); 34 #include 99 static int bufspace; 35 #include 100 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, 36 #include 101 "KVA memory used for bufs"); 37 #include 102 static int maxbufspace; 38 #include 103 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, 39 #include 104 "Maximum allowed value of bufspace (including buf_daemon)"); 40 #include 105 static int bufmallocspace; 41 #include 106 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, 42 #include 107 "Amount of malloced memory for buffers"); 43 #include 108 static int maxbufmallocspace; 44 #include 109 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 45 #include 0, 46 #include 110 "Maximum amount of malloced memory for buffers"); 47 #include 111 static int lobufspace; 48 #include 112 SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, 49 #include 113 "Minimum amount of buffers we want to have"); 50 #include 114 static int hibufspace; 51 #include 115 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, 52 #include 116 "Maximum allowed value of bufspace (excluding buf_daemon)"); 53 #include 117 static int bufreusecnt; 54 #include 118 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, 55 #include 119 "Number of times we have reused a buffer"); 56 #include 120 static int buffreekvacnt; 57 #include "opt_directio.h" 121 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, 58 #include "opt_swap.h" 122 "Number of times we have freed the KVA space from some buffer"); 59 123 static int bufdefragcnt; 60 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); 124 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, 61 125 "Number of times we have had to repeat buffer allocation to defragment"); 62 struct bio_ops bioops; /* I/O operation notification */ 126 static int lorunningspace; 63 127 SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, 64 struct buf_ops buf_ops_bio = { 128 "Minimum preferred space used for in-progress I/O"); 11/15/03 01:28:09 sys/kern/vfs_bio.c 2 129 static int hirunningspace; 192 130 SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, 193 /* 131 "Maximum amount of space to use for in-progress I/O"); 194 * Synchronization (sleep/wakeup) variable for active buffer space requests. 132 static int dirtybufferflushes; 195 * Set when wait starts, cleared prior to wakeup(). 133 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes 196 * Used in runningbufwakeup() and waitrunningbufspace(). 
, 197 */ 134 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); 198 static int runningbufreq; 135 static int altbufferflushes; 199 136 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 200 /* 137 0, "Number of fsync flushes to limit dirty buffers"); 201 * This lock protects the runningbufreq and synchronizes runningbufwakeup and 138 static int recursiveflushes; 202 * waitrunningbufspace(). 139 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 203 */ 140 0, "Number of flushes skipped due to being recursive"); 204 static struct mtx rbreqlock; 141 static int numdirtybuffers; 205 142 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, 206 /* 143 "Number of buffers that are dirty (has unwritten changes) at the moment"); 207 * Synchronization (sleep/wakeup) variable for buffer requests. 144 static int lodirtybuffers; 208 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done 145 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, 209 * by and/or. 146 "How many buffers we want to have free before bufdaemon can sleep"); 210 * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), 147 static int hidirtybuffers; 211 * getnewbuf(), and getblk(). 148 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, 212 */ 149 "When the number of dirty buffers is considered severe"); 213 static int needsbuffer; 150 static int dirtybufthresh; 214 151 SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 215 /* 152 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); 216 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. 153 static int numfreebuffers; 217 */ 154 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, 218 static struct mtx nblock; 155 "Number of free buffers"); 219 156 static int lofreebuffers; 220 /* 157 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, 221 * Lock that protects against bwait()/bdone()/B_DONE races. 158 "XXX Unused"); 222 */ 159 static int hifreebuffers; 223 160 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, 224 static struct mtx bdonelock; 161 "XXX Complicatedly unused"); 225 162 static int getnewbufcalls; 226 /* 163 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, 227 * Definitions for the buffer free lists. 164 "Number of calls to getnewbuf"); 228 */ 165 static int getnewbufrestarts; 229 #define BUFFER_QUEUES 5 /* number of free buffer queues */ 166 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 230 0, 231 #define QUEUE_NONE 0 /* on no queue */ 167 "Number of times getnewbuf has had to restart a buffer aquisition"); 232 #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ 168 static int dobkgrdwrite = 1; 233 #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ 169 SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, 234 #define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */ 170 "Do background writes (honoring the BV_BKGRDWRITE flag)?"); 235 #define QUEUE_EMPTY 4 /* empty buffer headers */ 171 236 172 /* 237 /* Queues for free buffers with various properties */ 173 * Wakeup point for bufdaemon, as well as indicator of whether it is already 238 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; 174 * active. 
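The free-list machinery above is just an array of five TAILQ heads indexed by b_qindex and protected by bqlock. The standalone model below uses the same <sys/queue.h> macros with a stripped-down stand-in for struct buf (only the fields the queue code touches) to show the insert/remove discipline that bufinit() and bremfreel() implement.

#include <sys/queue.h>
#include <assert.h>
#include <stdio.h>

#define BUFFER_QUEUES   5
#define QUEUE_NONE      0
#define QUEUE_CLEAN     1
#define QUEUE_DIRTY     2
#define QUEUE_EMPTYKVA  3
#define QUEUE_EMPTY     4

struct buf {
    int                 b_qindex;
    TAILQ_ENTRY(buf)    b_freelist;
};

static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];

/* Counterpart of bremfreel(): take a buffer off whatever queue holds it. */
static void
model_bremfree(struct buf *bp)
{
    assert(bp->b_qindex != QUEUE_NONE);
    TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
    bp->b_qindex = QUEUE_NONE;
}

int
main(void)
{
    struct buf b;
    int i;

    for (i = 0; i < BUFFER_QUEUES; i++)
        TAILQ_INIT(&bufqueues[i]);

    /* bufinit() parks every fresh header on QUEUE_EMPTY; do the same. */
    b.b_qindex = QUEUE_EMPTY;
    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], &b, b_freelist);

    model_bremfree(&b);
    printf("QUEUE_EMPTY after removal is %s\n",
        TAILQ_EMPTY(&bufqueues[QUEUE_EMPTY]) ? "empty" : "non-empty");
    return (0);
}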
Set to 1 when the bufdaemon is already "on" the queue, 0 when it 239 175 * is idling. 240 /* Lock for the bufqueues */ 176 */ 241 static struct mtx bqlock; 177 static int bd_request; 242 178 243 /* 179 /* 244 * Single global constant for BUF_WMESG, to avoid getting multiple references. 180 * This lock synchronizes access to bd_request. 245 * buf_wmesg is referred from macros. 181 */ 246 */ 182 static struct mtx bdlock; 247 const char *buf_wmesg = BUF_WMESG; 183 248 184 /* 249 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ 185 * bogus page -- for I/O to/from partially complete buffers 250 #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ 186 * this is a temporary solution to the problem, but it is not 251 #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis * 187 * really that bad. it would be better to split the buffer / 188 * for input in the case of buffers partially already in memory, 252 #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis * 189 * but the code is intricate enough already. / 190 */ 253 191 vm_page_t bogus_page; 254 #ifdef DIRECTIO 11/15/03 01:28:09 sys/kern/vfs_bio.c 3 255 extern void ffs_rawread_setup(void); 320 256 #endif /* DIRECTIO */ 321 /* 257 /* 322 * bufcountwakeup: 258 * numdirtywakeup: 323 * 259 * 324 * Called when a buffer has been added to one of the free queues to 260 * If someone is blocked due to there being too many dirty buffers, 325 * account for the buffer and to wakeup anyone waiting for free buffers. 261 * and numdirtybuffers is now reasonable, wake them up. 326 * This typically occurs when large amounts of metadata are being handled 262 */ 327 * by the buffer cache ( else buffer space runs out first, usually ). 263 328 */ 264 static __inline void 329 265 numdirtywakeup(int level) 330 static __inline void 266 { 331 bufcountwakeup(void) 267 if (numdirtybuffers <= level) { 332 { 268 mtx_lock(&nblock); 333 atomic_add_int(&numfreebuffers, 1); 269 if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { 334 mtx_lock(&nblock); 270 needsbuffer &= ˜VFS_BIO_NEED_DIRTYFLUSH; 335 if (needsbuffer) { 271 wakeup(&needsbuffer); 336 needsbuffer &= ˜VFS_BIO_NEED_ANY; 272 } 337 if (numfreebuffers >= hifreebuffers) 273 mtx_unlock(&nblock); 338 needsbuffer &= ˜VFS_BIO_NEED_FREE; 274 } 339 wakeup(&needsbuffer); 275 } 340 } 276 341 mtx_unlock(&nblock); 277 /* 342 } 278 * bufspacewakeup: 343 279 * 344 /* 280 * Called when buffer space is potentially available for recovery. 345 * waitrunningbufspace() 281 * getnewbuf() will block on this flag when it is unable to free 346 * 282 * sufficient buffer space. Buffer space becomes recoverable when 347 * runningbufspace is a measure of the amount of I/O currently 283 * bp’s get placed back in the queues. 348 * running. This routine is used in async-write situations to 284 */ 349 * prevent creating huge backups of pending writes to a device. 285 350 * Only asynchronous writes are governed by this function. 286 static __inline void 351 * 287 bufspacewakeup(void) 352 * Reads will adjust runningbufspace, but will not block based on it. 288 { 353 * The read load has a side effect of reducing the allowed write load. 289 /* 354 * 290 * If someone is waiting for BUF space, wake them up. Even 355 * This does NOT turn an async write into a sync write. It waits 291 * though we haven’t freed the kva space yet, the waiting 356 * for earlier writes to complete and generally returns before the 292 * process will be able to now. 357 * caller’s write has reached the device. 
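waitrunningbufspace() and runningbufwakeup() above form a hysteresis throttle: async writers stall once in-flight write bytes exceed hirunningspace and are only released when completions bring the total back down to lorunningspace. The sketch below is a userland analog of that handshake built on a POSIX mutex and condition variable instead of rbreqlock and msleep()/wakeup(); the constants are the values assigned in bufinit(), and the atomic accounting of the real code is folded under the lock for brevity.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rblock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rbcond = PTHREAD_COND_INITIALIZER;
static long runningspace;                       /* bytes of writes in flight */
static const long lorunning = 512 * 1024;       /* values taken from bufinit() */
static const long hirunning = 1024 * 1024;

/* Called by a would-be async writer, like waitrunningbufspace(). */
static void
throttle_wait(void)
{
    pthread_mutex_lock(&rblock);
    while (runningspace > hirunning)
        pthread_cond_wait(&rbcond, &rblock);
    pthread_mutex_unlock(&rblock);
}

/* Called when a write is issued (delta > 0) or completes (delta < 0),
 * like the accounting in bwrite() and runningbufwakeup(). */
static void
account(long delta)
{
    pthread_mutex_lock(&rblock);
    runningspace += delta;
    if (delta < 0 && runningspace <= lorunning)
        pthread_cond_broadcast(&rbcond);
    pthread_mutex_unlock(&rblock);
}

int
main(void)
{
    account(64 * 1024);         /* issue a 64K write */
    throttle_wait();            /* below hirunning: returns at once */
    account(-64 * 1024);        /* completion */
    printf("in flight: %ld bytes\n", runningspace);
    return (0);
}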
293 */ 358 */ 294 mtx_lock(&nblock); 359 static __inline void 295 if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { 360 waitrunningbufspace(void) 296 needsbuffer &= ˜VFS_BIO_NEED_BUFSPACE; 361 { 297 wakeup(&needsbuffer); 362 mtx_lock(&rbreqlock); 298 } 363 while (runningbufspace > hirunningspace) { 299 mtx_unlock(&nblock); 364 ++runningbufreq; 300 } 365 msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); 301 366 } 302 /* 367 mtx_unlock(&rbreqlock); 303 * runningbufwakeup() - in-progress I/O accounting. 368 } 304 * 369 305 */ 370 306 static __inline void 371 /* 307 runningbufwakeup(struct buf *bp) 372 * vfs_buf_test_cache: 308 { 373 * 309 if (bp->b_runningbufspace) { 374 * Called when a buffer is extended. This function clears the B_CACHE 310 atomic_subtract_int(&runningbufspace, bp->b_runningbufspace); 375 * bit if the newly extended portion of the buffer does not contain 311 bp->b_runningbufspace = 0; 376 * valid data. 312 mtx_lock(&rbreqlock); 377 */ 313 if (runningbufreq && runningbufspace <= lorunningspace) { 378 static __inline__ 314 runningbufreq = 0; 379 void 315 wakeup(&runningbufreq); 380 vfs_buf_test_cache(struct buf *bp, 316 } 381 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, 317 mtx_unlock(&rbreqlock); 382 vm_page_t m) 318 } 383 { 319 } 384 GIANT_REQUIRED; 11/15/03 01:28:09 sys/kern/vfs_bio.c 4 385 450 if (physmem_est > 65536) 386 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 451 nbuf += (physmem_est - 65536) * 2 / (factor * 5); 387 if (bp->b_flags & B_CACHE) { 452 388 int base = (foff + off) & PAGE_MASK; 453 if (maxbcache && nbuf > maxbcache / BKVASIZE) 389 if (vm_page_is_valid(m, base, size) == 0) 454 nbuf = maxbcache / BKVASIZE; 390 bp->b_flags &= ˜B_CACHE; 455 } 391 } 456 392 } 457 #if 0 393 458 /* 394 /* Wake up the buffer deamon if necessary */ 459 * Do not allow the buffer_map to be more then 1/2 the size of the 395 static __inline__ 460 * kernel_map. 396 void 461 */ 397 bd_wakeup(int dirtybuflevel) 462 if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / 398 { 463 (BKVASIZE * 2)) { 399 mtx_lock(&bdlock); 464 nbuf = (kernel_map->max_offset - kernel_map->min_offset) / 400 if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { 465 (BKVASIZE * 2); 401 bd_request = 1; 466 printf("Warning: nbufs capped at %d\n", nbuf); 402 wakeup(&bd_request); 467 } 403 } 468 #endif 404 mtx_unlock(&bdlock); 469 405 } 470 /* 406 471 * swbufs are used as temporary holders for I/O, such as paging I/O. 407 /* 472 * We have no less then 16 and no more then 256. 408 * bd_speedup - speedup the buffer cache flushing code 473 */ 409 */ 474 nswbuf = max(min(nbuf/4, 256), 16); 410 475 #ifdef NSWBUF_MIN 411 static __inline__ 476 if (nswbuf < NSWBUF_MIN) 412 void 477 nswbuf = NSWBUF_MIN; 413 bd_speedup(void) 478 #endif 414 { 479 #ifdef DIRECTIO 415 bd_wakeup(1); 480 ffs_rawread_setup(); 416 } 481 #endif 417 482 418 /* 483 /* 419 * Calculating buffer cache scaling values and reserve space for buffer 484 * Reserve space for the buffer cache buffers 420 * headers. This is called during low level kernel initialization and 485 */ 421 * may be called more then once. We CANNOT write to the memory area 486 swbuf = (void *)v; 422 * being reserved at this time. 487 v = (caddr_t)(swbuf + nswbuf); 423 */ 488 buf = (void *)v; 424 caddr_t 489 v = (caddr_t)(buf + nbuf); 425 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) 490 426 { 491 return(v); 427 /* 492 } 428 * physmem_est is in pages. Convert it to kilobytes (assumes 493 429 * PAGE_SIZE is >= 1K) 494 /* Initialize the buffer subsystem. 
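kern_vfs_bio_buffer_alloc() above sizes nbuf from the physical memory estimate: roughly 1/4 of the first 64MB and 1/20 of everything beyond, expressed through the BKVASIZE-derived factor, then capped by the maxbcache tunable. The same arithmetic follows as a standalone program so the result can be checked for a given memory size; the 16K BKVASIZE is only an assumed illustrative value (the real constant lives in <sys/param.h>).

#include <stdio.h>

#define BKVASIZE    16384       /* assumption for the example */

static long
estimate_nbuf(long physmem_kb, long maxbcache)
{
    long factor = 4 * BKVASIZE / 1024;
    long nbuf = 50;

    if (physmem_kb > 4096)
        nbuf += (physmem_kb - 4096) / factor < 65536 / factor ?
            (physmem_kb - 4096) / factor : 65536 / factor;
    if (physmem_kb > 65536)
        nbuf += (physmem_kb - 65536) * 2 / (factor * 5);
    if (maxbcache != 0 && nbuf > maxbcache / BKVASIZE)
        nbuf = maxbcache / BKVASIZE;
    return (nbuf);
}

int
main(void)
{
    /* 256MB and 2GB of RAM, no maxbcache tunable set. */
    printf("256MB -> nbuf %ld\n", estimate_nbuf(256 * 1024, 0));
    printf("2GB   -> nbuf %ld\n", estimate_nbuf(2048 * 1024, 0));
    return (0);
}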
Called before use of any buffers. */ 430 */ 495 void 431 physmem_est = physmem_est * (PAGE_SIZE / 1024); 496 bufinit(void) 432 497 { 433 /* 498 struct buf *bp; 434 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. 499 int i; 435 * For the first 64MB of ram nominally allocate sufficient buffers to 500 436 * cover 1/4 of our ram. Beyond the first 64MB allocate additional 501 GIANT_REQUIRED; 437 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing 502 438 * the buffer cache we limit the eventual kva reservation to 503 mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF); 439 * maxbcache bytes. 504 mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); 440 * 505 mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); 441 * factor represents the 1/4 x ram conversion. 506 mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); 442 */ 507 mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF); 443 if (nbuf == 0) { 508 444 int factor = 4 * BKVASIZE / 1024; 509 /* next, make a null set of free lists */ 445 510 for (i = 0; i < BUFFER_QUEUES; i++) 446 nbuf = 50; 511 TAILQ_INIT(&bufqueues[i]); 447 if (physmem_est > 4096) 512 448 nbuf += min((physmem_est - 4096) / factor, 513 /* finally, initialize each buffer header and stick on empty q */ 449 65536 / factor); 514 for (i = 0; i < nbuf; i++) { 11/15/03 01:28:09 sys/kern/vfs_bio.c 5 515 bp = &buf[i]; 580 lofreebuffers = nbuf / 18 + 5; 516 bzero(bp, sizeof *bp); 581 hifreebuffers = 2 * lofreebuffers; 517 bp->b_flags = B_INVAL; /* we’re just an empty header */ 582 numfreebuffers = nbuf; 518 bp->b_dev = NODEV; 583 519 bp->b_rcred = NOCRED; 584 /* 520 bp->b_wcred = NOCRED; 585 * Maximum number of async ops initiated per buf_daemon loop. This is 521 bp->b_qindex = QUEUE_EMPTY; 586 * somewhat of a hack at the moment, we really need to limit ourselves 522 bp->b_vflags = 0; 587 * based on the number of bytes of I/O in-transit that were initiated 523 bp->b_xflags = 0; 588 * from buf_daemon. 524 LIST_INIT(&bp->b_dep); 589 */ 525 BUF_LOCKINIT(bp); 590 526 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); 591 bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | 527 } 592 VM_ALLOC_NORMAL | VM_ALLOC_WIRED); 528 593 } 529 /* 594 530 * maxbufspace is the absolute maximum amount of buffer space we are 595 /* 531 * allowed to reserve in KVM and in real terms. The absolute maximum 596 * bfreekva() - free the kva allocation for a buffer. 532 * is nominally used by buf_daemon. hibufspace is the nominal maximum 597 * 533 * used by most other processes. The differential is required to 598 * Must be called at splbio() or higher as this is the only locking for 534 * ensure that buf_daemon is able to run when other processes might 599 * buffer_map. 535 * be blocked waiting for buffer space. 600 * 536 * 601 * Since this call frees up buffer space, we call bufspacewakeup(). 537 * maxbufspace is based on BKVASIZE. Allocating buffers larger then 602 */ 538 * this may result in KVM fragmentation which is not handled optimally 603 static void 539 * by the system. 
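bufinit() then derives the space and dirty-buffer watermarks from nbuf. The standalone sketch below repeats that derivation (imax() written out as a ternary); BKVASIZE and MAXBSIZE are assumed values here, and the nbuf figure is simply the one computed in the previous sketch.

#include <stdio.h>

#define BKVASIZE    16384
#define MAXBSIZE    65536

int
main(void)
{
    long nbuf = 2302;           /* e.g. the 256MB estimate above */
    long maxbufspace, hibufspace, lobufspace;
    long hidirtybuffers, lodirtybuffers;

    maxbufspace = nbuf * BKVASIZE;
    hibufspace = 3 * maxbufspace / 4 > maxbufspace - MAXBSIZE * 10 ?
        3 * maxbufspace / 4 : maxbufspace - MAXBSIZE * 10;
    lobufspace = hibufspace - MAXBSIZE;

    hidirtybuffers = nbuf / 4 + 20;
    /* Keep delayed writes from eating all of a small buffer space. */
    while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4)
        hidirtybuffers >>= 1;
    lodirtybuffers = hidirtybuffers / 2;

    printf("maxbufspace %ld  hibufspace %ld  lobufspace %ld\n",
        maxbufspace, hibufspace, lobufspace);
    printf("hidirtybuffers %ld  lodirtybuffers %ld\n",
        hidirtybuffers, lodirtybuffers);
    return (0);
}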
604 bfreekva(struct buf * bp) 540 */ 605 { 541 maxbufspace = nbuf * BKVASIZE; 606 GIANT_REQUIRED; 542 hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); 607 543 lobufspace = hibufspace - MAXBSIZE; 608 if (bp->b_kvasize) { 544 609 atomic_add_int(&buffreekvacnt, 1); 545 lorunningspace = 512 * 1024; 610 atomic_subtract_int(&bufspace, bp->b_kvasize); 546 hirunningspace = 1024 * 1024; 611 vm_map_delete(buffer_map, 547 612 (vm_offset_t) bp->b_kvabase, 548 /* 613 (vm_offset_t) bp->b_kvabase + bp->b_kvasize 549 * Limit the amount of malloc memory since it is wired permanently into 614 ); 550 * the kernel space. Even though this is accounted for in the buffer 615 bp->b_kvasize = 0; 551 * allocation, we don’t want the malloced region to grow uncontrolled. 616 bufspacewakeup(); 552 * The malloc scheme improves memory utilization significantly on average 617 } 553 * (small) directories. 618 } 554 */ 619 555 maxbufmallocspace = hibufspace / 20; 620 /* 556 621 * bremfree: 557 /* 622 * 558 * Reduce the chance of a deadlock occuring by limiting the number 623 * Remove the buffer from the appropriate free list. 559 * of delayed-write dirty buffers we allow to stack up. 624 */ 560 */ 625 void 561 hidirtybuffers = nbuf / 4 + 20; 626 bremfree(struct buf * bp) 562 dirtybufthresh = hidirtybuffers * 9 / 10; 627 { 563 numdirtybuffers = 0; 628 mtx_lock(&bqlock); 564 /* 629 bremfreel(bp); 565 * To support extreme low-memory systems, make sure hidirtybuffers cannot 630 mtx_unlock(&bqlock); 566 * eat up all available buffer space. This occurs when our minimum cannot 631 } 567 * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming 632 568 * BKVASIZE’d (8K) buffers. 633 void 569 */ 634 bremfreel(struct buf * bp) 570 while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { 635 { 571 hidirtybuffers >>= 1; 636 int s = splbio(); 572 } 637 int old_qindex = bp->b_qindex; 573 lodirtybuffers = hidirtybuffers / 2; 638 574 639 GIANT_REQUIRED; 575 /* 640 576 * Try to keep the number of free buffers in the specified range, 641 if (bp->b_qindex != QUEUE_NONE) { 577 * and give special processes (e.g. like buf_daemon) access to an 642 KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp) 578 * emergency reserve. ); 579 */ 643 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); 11/15/03 01:28:09 sys/kern/vfs_bio.c 6 644 bp->b_qindex = QUEUE_NONE; 709 if (bp->b_rcred == NOCRED && cred != NOCRED) 645 } else { 710 bp->b_rcred = crhold(cred); 646 if (BUF_REFCNT(bp) <= 1) 711 vfs_busy_pages(bp, 0); 647 panic("bremfree: removing a buffer not on a queue"); 712 bp->b_iooffset = dbtob(bp->b_blkno); 648 } 713 if (vp->v_type == VCHR) 649 714 VOP_SPECSTRATEGY(vp, bp); 650 /* 715 else 651 * Fixup numfreebuffers count. If the buffer is invalid or not 716 VOP_STRATEGY(vp, bp); 652 * delayed-write, and it was on the EMPTY, LRU, or AGE queues, 717 ++readwait; 653 * the buffer was free and we must decrement numfreebuffers. 
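bread() above is the synchronous entry point most filesystems use for metadata: it returns a locked, valid buffer or an error, and the caller must release the buffer either way. A kernel-context sketch of that pattern follows; it assumes the usual kernel headers and will not build as a standalone program, and everything except bread(), brelse(), bqrelse(), bcopy() and the struct buf fields shown in the listing is hypothetical scaffolding.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/ucred.h>
#include <sys/vnode.h>

/* Hypothetical helper: read one logical block of vp into dst. */
static int
example_read_block(struct vnode *vp, daddr_t lbn, int bsize, void *dst)
{
    struct buf *bp;
    int error;

    error = bread(vp, lbn, bsize, NOCRED, &bp);
    if (error != 0) {
        brelse(bp);                 /* buffer is returned even on error */
        return (error);
    }
    bcopy(bp->b_data, dst, bsize);  /* B_CACHE set: contents are valid */
    bqrelse(bp);                    /* expect to use it again soon */
    return (0);
}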
718 } 654 */ 719 655 if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { 720 for (i = 0; i < cnt; i++, rablkno++, rabsize++) { 656 switch(old_qindex) { 721 if (inmem(vp, *rablkno)) 657 case QUEUE_DIRTY: 722 continue; 658 case QUEUE_CLEAN: 723 rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); 659 case QUEUE_EMPTY: 724 660 case QUEUE_EMPTYKVA: 725 if ((rabp->b_flags & B_CACHE) == 0) { 661 atomic_subtract_int(&numfreebuffers, 1); 726 if (curthread != PCPU_GET(idlethread)) 662 break; 727 curthread->td_proc->p_stats->p_ru.ru_inblock++ 663 default: ; 664 break; 728 rabp->b_flags |= B_ASYNC; 665 } 729 rabp->b_flags &= ˜B_INVAL; 666 } 730 rabp->b_ioflags &= ˜BIO_ERROR; 667 splx(s); 731 rabp->b_iocmd = BIO_READ; 668 } 732 if (rabp->b_rcred == NOCRED && cred != NOCRED) 669 733 rabp->b_rcred = crhold(cred); 670 734 vfs_busy_pages(rabp, 0); 671 /* 735 BUF_KERNPROC(rabp); 672 * Get a buffer with the specified data. Look in the cache first. We 736 rabp->b_iooffset = dbtob(rabp->b_blkno); 673 * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE 737 if (vp->v_type == VCHR) 674 * is set, the buffer is valid and we do not have to do anything ( see 738 VOP_SPECSTRATEGY(vp, rabp); 675 * getblk() ). This is really just a special case of breadn(). 739 else 676 */ 740 VOP_STRATEGY(vp, rabp); 677 int 741 } else { 678 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, 742 brelse(rabp); 679 struct buf ** bpp) 743 } 680 { 744 } 681 745 682 return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp)); 746 if (readwait) { 683 } 747 rv = bufwait(bp); 684 748 } 685 /* 749 return (rv); 686 * Operates like bread, but also starts asynchronous I/O on 750 } 687 * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior 751 688 * to initiating I/O . If B_CACHE is set, the buffer is valid 752 /* 689 * and we do not have to do anything. 753 * Write, release buffer on completion. (Done by iodone 690 */ 754 * if async). Do not bother writing anything if the buffer 691 int 755 * is invalid. 692 breadn(struct vnode * vp, daddr_t blkno, int size, 756 * 693 daddr_t * rablkno, int *rabsize, 757 * Note that we set B_CACHE here, indicating that buffer is 694 int cnt, struct ucred * cred, struct buf ** bpp) 758 * fully valid and thus cacheable. This is true even of NFS 695 { 759 * now so we set it generally. This could be set either here 696 struct buf *bp, *rabp; 760 * or in biodone() since the I/O is synchronous. We put it 697 int i; 761 * here. 698 int rv = 0, readwait = 0; 762 */ 699 763 700 *bpp = bp = getblk(vp, blkno, size, 0, 0, 0); 764 int 701 765 bwrite(struct buf * bp) 702 /* if not found in cache, do some I/O */ 766 { 703 if ((bp->b_flags & B_CACHE) == 0) { 767 int oldflags, s; 704 if (curthread != PCPU_GET(idlethread)) 768 struct buf *newbp; 705 curthread->td_proc->p_stats->p_ru.ru_inblock++; 769 706 bp->b_iocmd = BIO_READ; 770 if (bp->b_flags & B_INVAL) { 707 bp->b_flags &= ˜B_INVAL; 771 brelse(bp); 708 bp->b_ioflags &= ˜BIO_ERROR; 772 return (0); 11/15/03 01:28:09 sys/kern/vfs_bio.c 7 773 } 838 newbp->b_flags |= B_ASYNC; 774 839 newbp->b_flags &= ˜B_INVAL; 775 oldflags = bp->b_flags; 840 776 841 /* move over the dependencies */ 777 if (BUF_REFCNT(bp) == 0) 842 if (LIST_FIRST(&bp->b_dep) != NULL) 778 panic("bwrite: buffer is not busy???"); 843 buf_movedeps(bp, newbp); 779 s = splbio(); 844 780 /* 845 /* 781 * If a background write is already in progress, delay 846 * Initiate write on the copy, release the original to 782 * writing this block if it is asynchronous. 
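breadn() adds asynchronous read-ahead to the same pattern: the extra blocks are fetched with getblk()/VOP_STRATEGY() in the loop above while only the first block is waited on. A short sketch of a caller follows, under the same kernel-context assumptions as the bread() sketch; the helper name and the choice of two read-ahead blocks are made up for illustration.

/*
 * Hypothetical helper: read lbn and start read-ahead on the next two
 * logical blocks.  The caller gets the locked buffer back in *bpp.
 */
static int
example_read_with_readahead(struct vnode *vp, daddr_t lbn, int bsize,
    struct ucred *cred, struct buf **bpp)
{
    daddr_t ralbn[2];
    int rasize[2];
    int error;

    ralbn[0] = lbn + 1;
    ralbn[1] = lbn + 2;
    rasize[0] = rasize[1] = bsize;

    error = breadn(vp, lbn, bsize, ralbn, rasize, 2, cred, bpp);
    if (error != 0)
        brelse(*bpp);
    return (error);
}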
Otherwise 847 * the B_LOCKED queue so that it cannot go away until 783 * wait for the background write to complete. 848 * the background write completes. If not locked it could go 784 */ 849 * away and then be reconstituted while it was being written. 785 VI_LOCK(bp->b_vp); 850 * If the reconstituted buffer were written, we could end up 786 if (bp->b_vflags & BV_BKGRDINPROG) { 851 * with two background copies being written at the same time. 787 if (bp->b_flags & B_ASYNC) { 852 */ 788 VI_UNLOCK(bp->b_vp); 853 bqrelse(bp); 789 splx(s); 854 bp = newbp; 790 bdwrite(bp); 855 } 791 return (0); 856 792 } 857 bp->b_flags &= ˜B_DONE; 793 bp->b_vflags |= BV_BKGRDWAIT; 858 bp->b_ioflags &= ˜BIO_ERROR; 794 msleep(&bp->b_xflags, VI_MTX(bp->b_vp), PRIBIO, "bwrbg", 0); 859 bp->b_flags |= B_WRITEINPROG | B_CACHE; 795 if (bp->b_vflags & BV_BKGRDINPROG) 860 bp->b_iocmd = BIO_WRITE; 796 panic("bwrite: still writing"); 861 797 } 862 VI_LOCK(bp->b_vp); 798 VI_UNLOCK(bp->b_vp); 863 bp->b_vp->v_numoutput++; 799 864 VI_UNLOCK(bp->b_vp); 800 /* Mark the buffer clean */ 865 vfs_busy_pages(bp, 1); 801 bundirty(bp); 866 802 867 /* 803 /* 868 * Normal bwrites pipeline writes 804 * If this buffer is marked for background writing and we 869 */ 805 * do not have to wait for it, make a copy and write the 870 bp->b_runningbufspace = bp->b_bufsize; 806 * copy so as to leave this buffer ready for further use. 871 atomic_add_int(&runningbufspace, bp->b_runningbufspace); 807 * 872 808 * This optimization eats a lot of memory. If we have a page 873 if (curthread != PCPU_GET(idlethread)) 809 * or buffer shortfall we can’t do it. 874 curthread->td_proc->p_stats->p_ru.ru_oublock++; 810 */ 875 splx(s); 811 if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && 876 if (oldflags & B_ASYNC) 812 (bp->b_flags & B_ASYNC) && 877 BUF_KERNPROC(bp); 813 !vm_page_count_severe() && 878 bp->b_iooffset = dbtob(bp->b_blkno); 814 !buf_dirty_count_severe()) { 879 if (bp->b_vp->v_type == VCHR) 815 if (bp->b_iodone != NULL) { 880 VOP_SPECSTRATEGY(bp->b_vp, bp); 816 printf("bp->b_iodone = %p\n", bp->b_iodone); 881 else 817 panic("bwrite: need chained iodone"); 882 VOP_STRATEGY(bp->b_vp, bp); 818 } 883 819 884 if ((oldflags & B_ASYNC) == 0) { 820 /* get a new block */ 885 int rtval = bufwait(bp); 821 newbp = geteblk(bp->b_bufsize); 886 brelse(bp); 822 887 return (rtval); 823 /* 888 } else { 824 * set it to be identical to the old block. We have to 889 /* 825 * set b_lblkno and BKGRDMARKER before calling bgetvp() 890 * don’t allow the async write to saturate the I/O 826 * to avoid confusing the splay tree and gbincore(). 891 * system. We will not deadlock here because 827 */ 892 * we are blocking waiting for I/O that is already in-progress 828 memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); 893 * to complete. We do not block here if it is the update 829 newbp->b_lblkno = bp->b_lblkno; 894 * or syncer daemon trying to clean up as that can lead 830 newbp->b_xflags |= BX_BKGRDMARKER; 895 * to deadlock. 831 VI_LOCK(bp->b_vp); 896 */ 832 bp->b_vflags |= BV_BKGRDINPROG; 897 if (curthread->td_proc != bufdaemonproc && 833 bgetvp(bp->b_vp, newbp); 898 curthread->td_proc != updateproc) 834 VI_UNLOCK(bp->b_vp); 899 waitrunningbufspace(); 835 newbp->b_blkno = bp->b_blkno; 900 } 836 newbp->b_offset = bp->b_offset; 901 837 newbp->b_iodone = vfs_backgroundwritedone; 902 return (0); 11/15/03 01:28:09 sys/kern/vfs_bio.c 8 903 } 968 struct vnode *vp; 904 969 struct buf *nbp; 905 /* 970 906 * Complete a background write started from bwrite. 
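The background-write path above is only taken when five conditions line up: the vfs.dobkgrdwrite knob, the BX_BKGRDWRITE marking on the buffer, an asynchronous request, and neither a page shortage nor a dirty-buffer shortage. Restated below as a standalone predicate over plain ints, purely as a reading aid; in the kernel the inputs come from bp->b_xflags/b_flags, vm_page_count_severe() and buf_dirty_count_severe().

#include <stdio.h>

static int
background_write_ok(int dobkgrdwrite, int marked_bkgrdwrite, int async,
    int page_shortage, int dirty_shortage)
{
    return (dobkgrdwrite && marked_bkgrdwrite && async &&
        !page_shortage && !dirty_shortage);
}

int
main(void)
{
    printf("async marked write, no shortage:  %d\n",
        background_write_ok(1, 1, 1, 0, 0));
    printf("same write under page shortage:   %d\n",
        background_write_ok(1, 1, 1, 1, 0));
    return (0);
}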
971 GIANT_REQUIRED; 907 */ 972 908 static void 973 if (BUF_REFCNT(bp) == 0) 909 vfs_backgroundwritedone(bp) 974 panic("bdwrite: buffer is not busy"); 910 struct buf *bp; 975 911 { 976 if (bp->b_flags & B_INVAL) { 912 struct buf *origbp; 977 brelse(bp); 913 978 return; 914 /* 979 } 915 * Find the original buffer that we are writing. 980 916 */ 981 /* 917 VI_LOCK(bp->b_vp); 982 * If we have too many dirty buffers, don’t create any more. 918 if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL) 983 * If we are wildly over our limit, then force a complete 919 panic("backgroundwritedone: lost buffer"); 984 * cleanup. Otherwise, just keep the situation from getting 920 985 * out of control. Note that we have to avoid a recursive 921 /* 986 * disaster and not try to clean up after our own cleanup! 922 * Clear the BV_BKGRDINPROG flag in the original buffer 987 */ 923 * and awaken it if it is waiting for the write to complete. 988 vp = bp->b_vp; 924 * If BV_BKGRDINPROG is not set in the original buffer it must 989 VI_LOCK(vp); 925 * have been released and re-instantiated - which is not legal. 990 if (td->td_pflags & TDP_COWINPROGRESS) { 926 */ 991 recursiveflushes++; 927 KASSERT((origbp->b_vflags & BV_BKGRDINPROG), 992 } else if (vp != NULL && vp->v_dirtybufcnt > dirtybufthresh + 10) { 928 ("backgroundwritedone: lost buffer2")); 993 VI_UNLOCK(vp); 929 origbp->b_vflags &= ˜BV_BKGRDINPROG; 994 (void) VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td); 930 if (origbp->b_vflags & BV_BKGRDWAIT) { 995 VI_LOCK(vp); 931 origbp->b_vflags &= ˜BV_BKGRDWAIT; 996 altbufferflushes++; 932 wakeup(&origbp->b_xflags); 997 } else if (vp != NULL && vp->v_dirtybufcnt > dirtybufthresh) { 933 } 998 /* 934 VI_UNLOCK(bp->b_vp); 999 * Try to find a buffer to flush. 935 /* 1000 */ 936 * Process dependencies then return any unfinished ones. 1001 TAILQ_FOREACH(nbp, &vp->v_dirtyblkhd, b_vnbufs) { 937 */ 1002 if ((nbp->b_vflags & BV_BKGRDINPROG) || 938 if (LIST_FIRST(&bp->b_dep) != NULL) 1003 buf_countdeps(nbp, 0) || 939 buf_complete(bp); 1004 BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 940 if (LIST_FIRST(&bp->b_dep) != NULL) 1005 continue; 941 buf_movedeps(bp, origbp); 1006 if (bp == nbp) 942 1007 panic("bdwrite: found ourselves"); 943 /* 1008 VI_UNLOCK(vp); 944 * This buffer is marked B_NOCACHE, so when it is released 1009 if (nbp->b_flags & B_CLUSTEROK) { 945 * by biodone, it will be tossed. We mark it with BIO_READ 1010 vfs_bio_awrite(nbp); 946 * to avoid biodone doing a second vwakeup. 1011 } else { 947 */ 1012 bremfree(nbp); 948 bp->b_flags |= B_NOCACHE; 1013 bawrite(nbp); 949 bp->b_iocmd = BIO_READ; 1014 } 950 bp->b_flags &= ˜(B_CACHE | B_DONE); 1015 VI_LOCK(vp); 951 bp->b_iodone = 0; 1016 dirtybufferflushes++; 952 bufdone(bp); 1017 break; 953 } 1018 } 954 1019 } 955 /* 1020 VI_UNLOCK(vp); 956 * Delayed write. (Buffer is marked dirty). Do not bother writing 1021 957 * anything if the buffer is marked invalid. 1022 bdirty(bp); 958 * 1023 /* 959 * Note that since the buffer must be completely valid, we can safely 1024 * Set B_CACHE, indicating that the buffer is fully valid. This is 960 * set B_CACHE. In fact, we have to set B_CACHE here rather then in 1025 * true even of NFS now. 961 * biodone() in order to prevent getblk from writing the buffer 1026 */ 962 * out synchronously. 1027 bp->b_flags |= B_CACHE; 963 */ 1028 964 void 1029 /* 965 bdwrite(struct buf * bp) 1030 * This bmap keeps the system from needing to do the bmap later, 966 { 1031 * perhaps when the system is attempting to do a sync. 
Since it 967 struct thread *td = curthread; 1032 * is likely that the indirect block -- or whatever other datastructur 11/15/03 01:28:09 sys/kern/vfs_bio.c 9 e 1096 bp->b_iocmd = BIO_WRITE; 1033 * that the filesystem needs is still in memory now, it is a good 1097 1034 * thing to do this. Note also, that if the pageout daemon is 1098 if ((bp->b_flags & B_DELWRI) == 0) { 1035 * requesting a sync -- there might not be enough memory to do 1099 bp->b_flags |= B_DONE | B_DELWRI; 1036 * the bmap then... So, this is important to do. 1100 reassignbuf(bp, bp->b_vp); 1037 */ 1101 atomic_add_int(&numdirtybuffers, 1); 1038 if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { 1102 bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); 1039 VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); 1103 } 1040 } 1104 } 1041 1105 1042 /* 1106 /* 1043 * Set the *dirty* buffer range based upon the VM system dirty pages. 1107 * bundirty: 1044 */ 1108 * 1045 vfs_setdirty(bp); 1109 * Clear B_DELWRI for buffer. 1046 1110 * 1047 /* 1111 * Since the buffer is not on a queue, we do not update the numfreebuffer 1048 * We need to do this here to satisfy the vnode_pager and the s 1049 * pageout daemon, so that it thinks that the pages have been 1112 * count. 1050 * "cleaned". Note that since the pages are in a delayed write 1113 * 1051 * buffer -- the VFS layer "will" see that the pages get written 1114 * Must be called at splbio(). 1052 * out on the next sync, or perhaps the cluster will be completed. 1115 * The buffer must be on QUEUE_NONE. 1053 */ 1116 */ 1054 vfs_clean_pages(bp); 1117 1055 bqrelse(bp); 1118 void 1056 1119 bundirty(bp) 1057 /* 1120 struct buf *bp; 1058 * Wakeup the buffer flushing daemon if we have a lot of dirty 1121 { 1059 * buffers (midpoint between our recovery point and our stall 1122 KASSERT(bp->b_qindex == QUEUE_NONE, 1060 * point). 1123 ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1061 */ 1124 1062 bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); 1125 if (bp->b_flags & B_DELWRI) { 1063 1126 bp->b_flags &= ˜B_DELWRI; 1064 /* 1127 reassignbuf(bp, bp->b_vp); 1065 * note: we cannot initiate I/O from a bdwrite even if we wanted to, 1128 atomic_subtract_int(&numdirtybuffers, 1); 1066 * due to the softdep code. 1129 numdirtywakeup(lodirtybuffers); 1067 */ 1130 } 1068 } 1131 /* 1069 1132 * Since it is now being written, we can clear its deferred write flag 1070 /* . 1071 * bdirty: 1133 */ 1072 * 1134 bp->b_flags &= ˜B_DEFERRED; 1073 * Turn buffer into delayed write request. We must clear BIO_READ and 1135 } 1074 * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to 1136 1075 * itself to properly update it in the dirty/clean lists. We mark it 1137 /* 1076 * B_DONE to ensure that any asynchronization of the buffer properly 1138 * bawrite: 1077 * clears B_DONE ( else a panic will occur later ). 1139 * 1078 * 1140 * Asynchronous write. Start output on a buffer, but do not wait for 1079 * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which 1141 * it to complete. The buffer is released when the output completes. 1080 * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() 1142 * 1081 * should only be called if the buffer is known-good. 1143 * bwrite() ( or the VOP routine anyway ) is responsible for handling 1082 * 1144 * B_INVAL buffers. Not us. 1083 * Since the buffer is not on a queue, we do not update the numfreebuffer 1145 */ s 1146 void 1084 * count. 1147 bawrite(struct buf * bp) 1085 * 1148 { 1086 * Must be called at splbio(). 
1149 bp->b_flags |= B_ASYNC; 1087 * The buffer must be on QUEUE_NONE. 1150 (void) BUF_WRITE(bp); 1088 */ 1151 } 1089 void 1152 1090 bdirty(bp) 1153 /* 1091 struct buf *bp; 1154 * bwillwrite: 1092 { 1155 * 1093 KASSERT(bp->b_qindex == QUEUE_NONE, 1156 * Called prior to the locking of any vnodes when we are expecting to 1094 ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1157 * write. We do not want to starve the buffer cache with too many 1095 bp->b_flags &= ˜(B_RELBUF); 1158 * dirty buffers so we block here. By blocking prior to the locking 11/15/03 01:28:09 sys/kern/vfs_bio.c 10 1159 * of any vnodes we attempt to avoid the situation where a locked vnode 1224 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || 1160 * prevents the various system daemons from flushing related buffers. 1225 (bp->b_ioflags & BIO_ERROR) || 1161 */ 1226 bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) { 1162 1227 /* 1163 void 1228 * Either a failed I/O or we were asked to free or not 1164 bwillwrite(void) 1229 * cache the buffer. 1165 { 1230 */ 1166 if (numdirtybuffers >= hidirtybuffers) { 1231 bp->b_flags |= B_INVAL; 1167 int s; 1232 if (LIST_FIRST(&bp->b_dep) != NULL) 1168 1233 buf_deallocate(bp); 1169 mtx_lock(&Giant); 1234 if (bp->b_flags & B_DELWRI) { 1170 s = splbio(); 1235 atomic_subtract_int(&numdirtybuffers, 1); 1171 mtx_lock(&nblock); 1236 numdirtywakeup(lodirtybuffers); 1172 while (numdirtybuffers >= hidirtybuffers) { 1237 } 1173 bd_wakeup(1); 1238 bp->b_flags &= ˜(B_DELWRI | B_CACHE); 1174 needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; 1239 if ((bp->b_flags & B_VMIO) == 0) { 1175 msleep(&needsbuffer, &nblock, 1240 if (bp->b_bufsize) 1176 (PRIBIO + 4), "flswai", 0); 1241 allocbuf(bp, 0); 1177 } 1242 if (bp->b_vp) 1178 splx(s); 1243 brelvp(bp); 1179 mtx_unlock(&nblock); 1244 } 1180 mtx_unlock(&Giant); 1245 } 1181 } 1246 1182 } 1247 /* 1183 1248 * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() 1184 /* 1249 * is called with B_DELWRI set, the underlying pages may wind up 1185 * Return true if we have too many dirty buffers. 1250 * getting freed causing a previous write (bdwrite()) to get ’lost’ 1186 */ 1251 * because pages associated with a B_DELWRI bp are marked clean. 1187 int 1252 * 1188 buf_dirty_count_severe(void) 1253 * We still allow the B_INVAL case to call vfs_vmio_release(), even 1189 { 1254 * if B_DELWRI is set. 1190 return(numdirtybuffers >= hidirtybuffers); 1255 * 1191 } 1256 * If B_DELWRI is not set we may have to set B_RELBUF if we are low 1192 1257 * on pages to return pages to the VM page queues. 1193 /* 1258 */ 1194 * brelse: 1259 if (bp->b_flags & B_DELWRI) 1195 * 1260 bp->b_flags &= ˜B_RELBUF; 1196 * Release a busy buffer and, if requested, free its resources. The 1261 else if (vm_page_count_severe()) { 1197 * buffer will be stashed in the appropriate bufqueue[] allowing it 1262 /* 1198 * to be accessed later as a cache entity or reused for other purposes. 1263 * XXX This lock may not be necessary since BKGRDINPROG 1199 */ 1264 * cannot be set while we hold the buf lock, it can only be 1200 void 1265 * cleared if it is already pending. 
1201 brelse(struct buf * bp) 1266 */ 1202 { 1267 if (bp->b_vp) { 1203 int s; 1268 VI_LOCK(bp->b_vp); 1204 1269 if (!(bp->b_vflags & BV_BKGRDINPROG)) 1205 GIANT_REQUIRED; 1270 bp->b_flags |= B_RELBUF; 1206 1271 VI_UNLOCK(bp->b_vp); 1207 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), 1272 } else 1208 ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1273 bp->b_flags |= B_RELBUF; 1209 1274 } 1210 s = splbio(); 1275 1211 1276 /* 1212 if (bp->b_iocmd == BIO_WRITE && 1277 * VMIO buffer rundown. It is not very necessary to keep a VMIO buffe 1213 (bp->b_ioflags & BIO_ERROR) && r 1214 !(bp->b_flags & B_INVAL)) { 1278 * constituted, not even NFS buffers now. Two flags effect this. If 1215 /* 1279 * B_INVAL, the struct buf is invalidated but the VM object is kept 1216 * Failed write, redirty. Must clear BIO_ERROR to prevent 1280 * around ( i.e. so it is trivial to reconstitute the buffer later ). 1217 * pages from being scrapped. If B_INVAL is set then 1281 * 1218 * this case is not run and the next case is run to 1282 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be 1219 * destroy the buffer. B_INVAL can occur if the buffer 1283 * invalidated. BIO_ERROR cannot be set for a failed write unless the 1220 * is outside the range supported by the underlying device. 1284 * buffer is also B_INVAL because it hits the re-dirtying code above. 1221 */ 1285 * 1222 bp->b_ioflags &= ˜BIO_ERROR; 1286 * Normally we can do this whether a buffer is B_DELWRI or not. If 1223 bdirty(bp); 1287 * the buffer is an NFS buffer, it is tracking piecemeal writes or 11/15/03 01:28:09 sys/kern/vfs_bio.c 11 1288 * the commit state and we cannot afford to lose the buffer. If the 1350 } 1289 * buffer has a background write in progress, we need to keep it 1351 } 1290 * around to prevent it from being reconstituted and starting a second 1352 1291 * background write. 1353 if ((bp->b_flags & B_INVAL) == 0) { 1292 */ 1354 pmap_qenter(trunc_page((vm_offset_t)bp 1293 if ((bp->b_flags & B_VMIO) ->b_data), bp->b_pages, bp->b_npages); 1294 && !(bp->b_vp->v_mount != NULL && 1355 } 1295 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 & 1356 m = bp->b_pages[i]; & 1357 } 1296 !vn_isdisk(bp->b_vp, NULL) && 1358 if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ 1297 (bp->b_flags & B_DELWRI)) ERROR)) { 1298 ) { 1359 int poffset = foff & PAGE_MASK; 1299 1360 int presid = resid > (PAGE_SIZE - poffset) ? 1300 int i, j, resid; 1361 (PAGE_SIZE - poffset) : resid; 1301 vm_page_t m; 1362 1302 off_t foff; 1363 KASSERT(presid >= 0, ("brelse: extra page")); 1303 vm_pindex_t poff; 1364 vm_page_lock_queues(); 1304 vm_object_t obj; 1365 vm_page_set_invalid(m, poffset, presid); 1305 struct vnode *vp; 1366 vm_page_unlock_queues(); 1306 1367 if (had_bogus) 1307 vp = bp->b_vp; 1368 printf("avoided corruption bug in bogu 1308 obj = bp->b_object; s_page/brelse code\n"); 1309 1369 } 1310 /* 1370 resid -= PAGE_SIZE - (foff & PAGE_MASK); 1311 * Get the base offset and length of the buffer. Note that 1371 foff = (foff + PAGE_SIZE) & ˜(off_t)PAGE_MASK; 1312 * in the VMIO case if the buffer block size is not 1372 } 1313 * page-aligned then b_data pointer may not be page-aligned. 1373 VM_OBJECT_UNLOCK(obj); 1314 * But our b_pages[] array *IS* page aligned. 1374 if (bp->b_flags & (B_INVAL | B_RELBUF)) 1315 * 1375 vfs_vmio_release(bp); 1316 * block sizes less then DEV_BSIZE (usually 512) are not 1376 1317 * supported due to the page granularity bits (m->valid, 1377 } else if (bp->b_flags & B_VMIO) { 1318 * m->dirty, etc...). 
1378 1319 * 1379 if (bp->b_flags & (B_INVAL | B_RELBUF)) { 1320 * See man buf(9) for more information 1380 vfs_vmio_release(bp); 1321 */ 1381 } 1322 resid = bp->b_bufsize; 1382 1323 foff = bp->b_offset; 1383 } 1324 VM_OBJECT_LOCK(obj); 1384 1325 for (i = 0; i < bp->b_npages; i++) { 1385 if (bp->b_qindex != QUEUE_NONE) 1326 int had_bogus = 0; 1386 panic("brelse: free buffer onto another queue???"); 1327 1387 if (BUF_REFCNT(bp) > 1) { 1328 m = bp->b_pages[i]; 1388 /* do not release to free list */ 1329 vm_page_lock_queues(); 1389 BUF_UNLOCK(bp); 1330 vm_page_flag_clear(m, PG_ZERO); 1390 splx(s); 1331 vm_page_unlock_queues(); 1391 return; 1332 1392 } 1333 /* 1393 1334 * If we hit a bogus page, fixup *all* the bogus pages 1394 /* enqueue */ 1335 * now. 1395 mtx_lock(&bqlock); 1336 */ 1396 1337 if (m == bogus_page) { 1397 /* buffers with no memory */ 1338 poff = OFF_TO_IDX(bp->b_offset); 1398 if (bp->b_bufsize == 0) { 1339 had_bogus = 1; 1399 bp->b_flags |= B_INVAL; 1340 1400 bp->b_xflags &= ˜(BX_BKGRDWRITE | BX_ALTDATA); 1341 for (j = i; j < bp->b_npages; j++) { 1401 if (bp->b_vflags & BV_BKGRDINPROG) 1342 vm_page_t mtmp; 1402 panic("losing buffer 1"); 1343 mtmp = bp->b_pages[j]; 1403 if (bp->b_kvasize) { 1344 if (mtmp == bogus_page) { 1404 bp->b_qindex = QUEUE_EMPTYKVA; 1345 mtmp = vm_page_lookup(obj, pof 1405 } else { f + j); 1406 bp->b_qindex = QUEUE_EMPTY; 1346 if (!mtmp) { 1407 } 1347 panic("brelse: page mi 1408 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); ssing\n"); 1409 bp->b_dev = NODEV; 1348 } 1410 /* buffers with junk contents */ 1349 bp->b_pages[j] = mtmp; 1411 } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || 11/15/03 01:28:09 sys/kern/vfs_bio.c 12 1412 (bp->b_ioflags & BIO_ERROR)) { 1475 * biodone() to requeue an async I/O on completion. It is also used when 1413 bp->b_flags |= B_INVAL; 1476 * known good buffers need to be requeued but we think we may need the data 1414 bp->b_xflags &= ˜(BX_BKGRDWRITE | BX_ALTDATA); 1477 * again soon. 1415 if (bp->b_vflags & BV_BKGRDINPROG) 1478 * 1416 panic("losing buffer 2"); 1479 * XXX we should be able to leave the B_RELBUF hint set on completion. 1417 bp->b_qindex = QUEUE_CLEAN; 1480 */ 1418 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); 1481 void 1419 bp->b_dev = NODEV; 1482 bqrelse(struct buf * bp) 1420 /* remaining buffers */ 1483 { 1421 } else { 1484 int s; 1422 if (bp->b_flags & B_DELWRI) 1485 1423 bp->b_qindex = QUEUE_DIRTY; 1486 s = splbio(); 1424 else 1487 1425 bp->b_qindex = QUEUE_CLEAN; 1488 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriat 1426 if (bp->b_flags & B_AGE) e B_PAGING or B_CLUSTER bp %p", bp)); 1427 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_free 1489 list); 1490 if (bp->b_qindex != QUEUE_NONE) 1428 else 1491 panic("bqrelse: free buffer onto another queue???"); 1429 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_free 1492 if (BUF_REFCNT(bp) > 1) { list); 1493 /* do not release to free list */ 1430 } 1494 BUF_UNLOCK(bp); 1431 mtx_unlock(&bqlock); 1495 splx(s); 1432 1496 return; 1433 /* 1497 } 1434 * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already 1498 mtx_lock(&bqlock); 1435 * placed the buffer on the correct queue. We must also disassociate 1499 /* buffers with stale but valid contents */ 1436 * the device and vnode for a B_INVAL buffer so gbincore() doesn’t 1500 if (bp->b_flags & B_DELWRI) { 1437 * find it. 
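The enqueue logic at the end of brelse() boils down to a small decision table: buffers with no memory go to one of the EMPTY queues, buffers with junk contents go to QUEUE_CLEAN, and the rest go to DIRTY or CLEAN depending on B_DELWRI. The standalone function below restates those rules; "invalid" lumps together the B_INVAL, B_NOCACHE and B_RELBUF cases of the listing, and the head-versus-tail insertion detail (B_AGE) is omitted.

#include <stdio.h>

#define QUEUE_CLEAN     1
#define QUEUE_DIRTY     2
#define QUEUE_EMPTYKVA  3
#define QUEUE_EMPTY     4

static int
brelse_queue(long bufsize, long kvasize, int invalid, int ioerror, int delwri)
{
    if (bufsize == 0)                   /* no memory at all */
        return (kvasize != 0 ? QUEUE_EMPTYKVA : QUEUE_EMPTY);
    if (invalid || ioerror)             /* junk contents */
        return (QUEUE_CLEAN);
    return (delwri ? QUEUE_DIRTY : QUEUE_CLEAN);
}

int
main(void)
{
    printf("dirty data buffer   -> queue %d\n",
        brelse_queue(16384, 16384, 0, 0, 1));
    printf("empty header w/ KVA -> queue %d\n",
        brelse_queue(0, 16384, 1, 0, 0));
    return (0);
}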
1501 bp->b_qindex = QUEUE_DIRTY; 1438 */ 1502 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); 1439 if (bp->b_flags & B_INVAL) { 1503 } else { 1440 if (bp->b_flags & B_DELWRI) 1504 /* 1441 bundirty(bp); 1505 * XXX This lock may not be necessary since BKGRDINPROG 1442 if (bp->b_vp) 1506 * cannot be set while we hold the buf lock, it can only be 1443 brelvp(bp); 1507 * cleared if it is already pending. 1444 } 1508 */ 1445 1509 VI_LOCK(bp->b_vp); 1446 /* 1510 if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) 1447 * Fixup numfreebuffers count. The bp is on an appropriate queue { 1448 * unless locked. We then bump numfreebuffers if it is not B_DELWRI. 1511 VI_UNLOCK(bp->b_vp); 1449 * We’ve already handled the B_INVAL case ( B_DELWRI will be clear 1512 bp->b_qindex = QUEUE_CLEAN; 1450 * if B_INVAL is set ). 1513 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, 1451 */ 1514 b_freelist); 1452 1515 } else { 1453 if (!(bp->b_flags & B_DELWRI)) 1516 /* 1454 bufcountwakeup(); 1517 * We are too low on memory, we have to try to free 1455 1518 * the buffer (most importantly: the wired pages 1456 /* 1519 * making up its backing store) *now*. 1457 * Something we can maybe free or reuse 1520 */ 1458 */ 1521 VI_UNLOCK(bp->b_vp); 1459 if (bp->b_bufsize || bp->b_kvasize) 1522 mtx_unlock(&bqlock); 1460 bufspacewakeup(); 1523 splx(s); 1461 1524 brelse(bp); 1462 bp->b_flags &= ˜(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); 1525 return; 1463 if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) 1526 } 1464 panic("brelse: not dirty"); 1527 } 1465 /* unlock */ 1528 mtx_unlock(&bqlock); 1466 BUF_UNLOCK(bp); 1529 1467 splx(s); 1530 if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) 1468 } 1531 bufcountwakeup(); 1469 1532 1470 /* 1533 /* 1471 * Release a buffer back to the appropriate queue but do not try to free 1534 * Something we can maybe free or reuse. 1472 * it. The buffer is expected to be used again soon. 1535 */ 1473 * 1536 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) 1474 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by 1537 bufspacewakeup(); 11/15/03 01:28:09 sys/kern/vfs_bio.c 13 1538 1603 if (bp->b_vp) 1539 bp->b_flags &= ˜(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); 1604 brelvp(bp); 1540 if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) 1605 } 1541 panic("bqrelse: not dirty"); 1606 1542 /* unlock */ 1607 /* 1543 BUF_UNLOCK(bp); 1608 * Check to see if a block at a particular lbn is available for a clustered 1544 splx(s); 1609 * write. 1545 } 1610 */ 1546 1611 static int 1547 /* Give pages used by the bp back to the VM system (where possible) */ 1612 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) 1548 static void 1613 { 1549 vfs_vmio_release(bp) 1614 struct buf *bpa; 1550 struct buf *bp; 1615 int match; 1551 { 1616 1552 int i; 1617 match = 0; 1553 vm_page_t m; 1618 1554 1619 /* If the buf isn’t in core skip it */ 1555 GIANT_REQUIRED; 1620 if ((bpa = gbincore(vp, lblkno)) == NULL) 1556 VM_OBJECT_LOCK(bp->b_object); 1621 return (0); 1557 vm_page_lock_queues(); 1622 1558 for (i = 0; i < bp->b_npages; i++) { 1623 /* If the buf is busy we don’t want to wait for it */ 1559 m = bp->b_pages[i]; 1624 if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 1560 bp->b_pages[i] = NULL; 1625 return (0); 1561 /* 1626 1562 * In order to keep page LRU ordering consistent, put 1627 /* Only cluster with valid clusterable delayed write buffers */ 1563 * everything on the inactive queue. 
1628 if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != 1564 */ 1629 (B_DELWRI | B_CLUSTEROK)) 1565 vm_page_unwire(m, 0); 1630 goto done; 1566 /* 1631 1567 * We don’t mess with busy pages, it is 1632 if (bpa->b_bufsize != size) 1568 * the responsibility of the process that 1633 goto done; 1569 * busied the pages to deal with them. 1634 1570 */ 1635 /* 1571 if ((m->flags & PG_BUSY) || (m->busy != 0)) 1636 * Check to see if it is in the expected place on disk and that the 1572 continue; 1637 * block has been mapped. 1573 1638 */ 1574 if (m->wire_count == 0) { 1639 if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) 1575 vm_page_flag_clear(m, PG_ZERO); 1640 match = 1; 1576 /* 1641 done: 1577 * Might as well free the page if we can and it has 1642 BUF_UNLOCK(bpa); 1578 * no valid data. We also free the page if the 1643 return (match); 1579 * buffer was used for direct I/O 1644 } 1580 */ 1645 1581 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && 1646 /* 1582 m->hold_count == 0) { 1647 * vfs_bio_awrite: 1583 vm_page_busy(m); 1648 * 1584 pmap_remove_all(m); 1649 * Implement clustered async writes for clearing out B_DELWRI buffers. 1585 vm_page_free(m); 1650 * This is much better then the old way of writing only one buffer at 1586 } else if (bp->b_flags & B_DIRECT) { 1651 * a time. Note that we may not be presented with the buffers in the 1587 vm_page_try_to_free(m); 1652 * correct order, so we search for the cluster in both directions. 1588 } else if (vm_page_count_severe()) { 1653 */ 1589 vm_page_try_to_cache(m); 1654 int 1590 } 1655 vfs_bio_awrite(struct buf * bp) 1591 } 1656 { 1592 } 1657 int i; 1593 vm_page_unlock_queues(); 1658 int j; 1594 VM_OBJECT_UNLOCK(bp->b_object); 1659 daddr_t lblkno = bp->b_lblkno; 1595 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); 1660 struct vnode *vp = bp->b_vp; 1596 1661 int s; 1597 if (bp->b_bufsize) { 1662 int ncl; 1598 bufspacewakeup(); 1663 int nwritten; 1599 bp->b_bufsize = 0; 1664 int size; 1600 } 1665 int maxcl; 1601 bp->b_npages = 0; 1666 1602 bp->b_flags &= ˜B_VMIO; 1667 s = splbio(); 11/15/03 01:28:09 sys/kern/vfs_bio.c 14 1668 /* 1733 * If we have to flush dirty buffers ( but we try to avoid this ) 1669 * right now we support clustered writing only to regular files. If 1734 * 1670 * we find a clusterable block we could be in the middle of a cluster 1735 * To avoid VFS layer recursion we do not flush dirty buffers ourselves. 1671 * rather then at the beginning. 1736 * Instead we ask the buf daemon to do it for us. We attempt to 1672 */ 1737 * avoid piecemeal wakeups of the pageout daemon. 
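vfs_bio_awrite() above builds a cluster by probing forward and then backward from the starting logical block, stopping at the first block that fails vfs_bio_clcheck(). The standalone model below reproduces that scan over a simple eligibility bitmap; MAXCL is normally MAXPHYS divided by the filesystem block size, and 16 is just an assumed value.

#include <stdio.h>

#define MAXCL   16          /* stand-in for MAXPHYS / blocksize */

static int
scan_cluster(const int *eligible, int nblocks, int lbn)
{
    int i, j;

    for (i = 1; i < MAXCL; i++)
        if (lbn + i >= nblocks || !eligible[lbn + i])
            break;
    for (j = 1; i + j <= MAXCL && j <= lbn; j++)
        if (!eligible[lbn - j])
            break;
    --j;
    return (i + j);         /* ncl: blocks in the would-be cluster */
}

int
main(void)
{
    /* lbns 2..6 are dirty, clusterable and correctly mapped. */
    int eligible[8] = { 0, 0, 1, 1, 1, 1, 1, 0 };

    printf("cluster size around lbn 4: %d\n",
        scan_cluster(eligible, 8, 4));
    return (0);
}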
1673 if ((vp->v_type == VREG) && 1738 */ 1674 (vp->v_mount != 0) && /* Only on nodes that have the size info */ 1739 1675 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 1740 static struct buf * 1676 1741 getnewbuf(int slpflag, int slptimeo, int size, int maxsize) 1677 size = vp->v_mount->mnt_stat.f_iosize; 1742 { 1678 maxcl = MAXPHYS / size; 1743 struct buf *bp; 1679 1744 struct buf *nbp; 1680 VI_LOCK(vp); 1745 int defrag = 0; 1681 for (i = 1; i < maxcl; i++) 1746 int nqindex; 1682 if (vfs_bio_clcheck(vp, size, lblkno + i, 1747 static int flushingbufs; 1683 bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) 1748 1684 break; 1749 GIANT_REQUIRED; 1685 1750 1686 for (j = 1; i + j <= maxcl && j <= lblkno; j++) 1751 /* 1687 if (vfs_bio_clcheck(vp, size, lblkno - j, 1752 * We can’t afford to block since we might be holding a vnode lock, 1688 bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) 1753 * which may prevent system daemons from running. We deal with 1689 break; 1754 * low-memory situations by proactively returning memory and running 1690 1755 * async I/O rather then sync I/O. 1691 VI_UNLOCK(vp); 1756 */ 1692 --j; 1757 1693 ncl = i + j; 1758 atomic_add_int(&getnewbufcalls, 1); 1694 /* 1759 atomic_subtract_int(&getnewbufrestarts, 1); 1695 * this is a possible cluster write 1760 restart: 1696 */ 1761 atomic_add_int(&getnewbufrestarts, 1); 1697 if (ncl != 1) { 1762 1698 BUF_UNLOCK(bp); 1763 /* 1699 nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); 1764 * Setup for scan. If we do not have enough free buffers, 1700 splx(s); 1765 * we setup a degenerate case that immediately fails. Note 1701 return nwritten; 1766 * that if we are specially marked process, we are allowed to 1702 } 1767 * dip into our reserves. 1703 } 1768 * 1704 1769 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN 1705 bremfree(bp); 1770 * 1706 bp->b_flags |= B_ASYNC; 1771 * We start with EMPTYKVA. If the list is empty we backup to EMPTY. 1707 1772 * However, there are a number of cases (defragging, reusing, ...) 1708 splx(s); 1773 * where we cannot backup. 1709 /* 1774 */ 1710 * default (old) behavior, writing out only one block 1775 mtx_lock(&bqlock); 1711 * 1776 nqindex = QUEUE_EMPTYKVA; 1712 * XXX returns b_bufsize instead of b_bcount for nwritten? 1777 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); 1713 */ 1778 1714 nwritten = bp->b_bufsize; 1779 if (nbp == NULL) { 1715 (void) BUF_WRITE(bp); 1780 /* 1716 1781 * If no EMPTYKVA buffers and we are either 1717 return nwritten; 1782 * defragging or reusing, locate a CLEAN buffer 1718 } 1783 * to free or reuse. If bufspace useage is low 1719 1784 * skip this step so we can allocate a new buffer. 1720 /* 1785 */ 1721 * getnewbuf: 1786 if (defrag || bufspace >= lobufspace) { 1722 * 1787 nqindex = QUEUE_CLEAN; 1723 * Find and initialize a new buffer header, freeing up existing buffers 1788 nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); 1724 * in the bufqueues as necessary. The new buffer is returned locked. 1789 } 1725 * 1790 1726 * Important: B_INVAL is not set. If the caller wishes to throw the 1791 /* 1727 * buffer away, the caller must set B_INVAL prior to calling brelse(). 1792 * If we could not find or were not allowed to reuse a 1728 * 1793 * CLEAN buffer, check to see if it is ok to use an EMPTY 1729 * We block if: 1794 * buffer. We can only use an EMPTY buffer if allocating 1730 * We have insufficient buffer headers 1795 * its KVA would not otherwise run us out of buffer space. 
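getnewbuf() decides where to start scanning before it touches any buffer: EMPTYKVA first, CLEAN only when defragmenting or when buffer space is no longer low, and EMPTY only when allocating fresh KVA would still fit under hibufspace. The following restatement over plain values is a reading aid for that setup logic, not kernel code.

#include <stdio.h>

#define QUEUE_CLEAN     1
#define QUEUE_EMPTYKVA  3
#define QUEUE_EMPTY     4
#define QUEUE_NOWHERE   (-1)    /* model-only: nothing to scan, must sleep */

static int
scan_start(int have_emptykva, int have_clean, int have_empty, int defrag,
    long bufspace, long lobufspace, long hibufspace, long maxsize)
{
    if (have_emptykva)
        return (QUEUE_EMPTYKVA);
    if ((defrag || bufspace >= lobufspace) && have_clean)
        return (QUEUE_CLEAN);
    if (!defrag && bufspace + maxsize < hibufspace && have_empty)
        return (QUEUE_EMPTY);
    return (QUEUE_NOWHERE);
}

int
main(void)
{
    printf("no EMPTYKVA, bufspace still low: start at queue %d\n",
        scan_start(0, 1, 1, 0, 1 << 20, 8 << 20, 32 << 20, 16384));
    return (0);
}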
1731 * We have insufficient buffer space 1796 */ 1732 * buffer_map is too fragmented ( space reservation fails ) 1797 if (nbp == NULL && defrag == 0 && 11/15/03 01:28:09 sys/kern/vfs_bio.c 15 1798 bufspace + maxsize < hibufspace) { 1859 * occur, if defrag is non-zero the buffer’s b_kvasize 1799 nqindex = QUEUE_EMPTY; 1860 * should also be non-zero at this point. XXX 1800 nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); 1861 */ 1801 } 1862 if (defrag && bp->b_kvasize == 0) { 1802 } 1863 printf("Warning: defrag empty buffer %p\n", bp); 1803 1864 continue; 1804 /* 1865 } 1805 * Run scan, possibly freeing data and/or kva mappings on the fly 1866 1806 * depending. 1867 /* 1807 */ 1868 * Start freeing the bp. This is somewhat involved. nbp 1808 1869 * remains valid only for QUEUE_EMPTY[KVA] bp’s. 1809 while ((bp = nbp) != NULL) { 1870 */ 1810 int qindex = nqindex; 1871 1811 1872 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 1812 /* 1873 panic("getnewbuf: locked buf"); 1813 * Calculate next bp ( we can only use it if we do not block 1874 bremfreel(bp); 1814 * or do other fancy things ). 1875 mtx_unlock(&bqlock); 1815 */ 1876 1816 if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { 1877 if (qindex == QUEUE_CLEAN) { 1817 switch(qindex) { 1878 if (bp->b_flags & B_VMIO) { 1818 case QUEUE_EMPTY: 1879 bp->b_flags &= ˜B_ASYNC; 1819 nqindex = QUEUE_EMPTYKVA; 1880 vfs_vmio_release(bp); 1820 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYK 1881 } VA]))) 1882 if (bp->b_vp) 1821 break; 1883 brelvp(bp); 1822 /* FALLTHROUGH */ 1884 } 1823 case QUEUE_EMPTYKVA: 1885 1824 nqindex = QUEUE_CLEAN; 1886 /* 1825 if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN] 1887 * NOTE: nbp is now entirely invalid. We can only restart ))) 1888 * the scan from this point on. 1826 break; 1889 * 1827 /* FALLTHROUGH */ 1890 * Get the rest of the buffer freed up. b_kva* is still 1828 case QUEUE_CLEAN: 1891 * valid after this operation. 1829 /* 1892 */ 1830 * nbp is NULL. 1893 1831 */ 1894 if (bp->b_rcred != NOCRED) { 1832 break; 1895 crfree(bp->b_rcred); 1833 } 1896 bp->b_rcred = NOCRED; 1834 } 1897 } 1835 if (bp->b_vp) { 1898 if (bp->b_wcred != NOCRED) { 1836 VI_LOCK(bp->b_vp); 1899 crfree(bp->b_wcred); 1837 if (bp->b_vflags & BV_BKGRDINPROG) { 1900 bp->b_wcred = NOCRED; 1838 VI_UNLOCK(bp->b_vp); 1901 } 1839 continue; 1902 if (LIST_FIRST(&bp->b_dep) != NULL) 1840 } 1903 buf_deallocate(bp); 1841 VI_UNLOCK(bp->b_vp); 1904 if (bp->b_vflags & BV_BKGRDINPROG) 1842 } 1905 panic("losing buffer 3"); 1843 1906 1844 /* 1907 if (bp->b_bufsize) 1845 * Sanity Checks 1908 allocbuf(bp, 0); 1846 */ 1909 1847 KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queu 1910 bp->b_flags = 0; e %d bp %p", qindex, bp)); 1911 bp->b_ioflags = 0; 1848 1912 bp->b_xflags = 0; 1849 /* 1913 bp->b_vflags = 0; 1850 * Note: we no longer distinguish between VMIO and non-VMIO 1914 bp->b_dev = NODEV; 1851 * buffers. 1915 bp->b_vp = NULL; 1852 */ 1916 bp->b_blkno = bp->b_lblkno = 0; 1853 1917 bp->b_offset = NOOFFSET; 1854 KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p foun 1918 bp->b_iodone = 0; d in queue %d", bp, qindex)); 1919 bp->b_error = 0; 1855 1920 bp->b_resid = 0; 1856 /* 1921 bp->b_bcount = 0; 1857 * If we are defragging then we need a buffer with 1922 bp->b_npages = 0; 1858 * b_kvasize != 0. 
XXX this situation should no longer 1923 bp->b_dirtyoff = bp->b_dirtyend = 0; 11/15/03 01:28:09 sys/kern/vfs_bio.c 16 1924 bp->b_magic = B_MAGIC_BIO; 1989 mtx_unlock(&nblock); 1925 bp->b_op = &buf_ops_bio; 1990 return (NULL); 1926 bp->b_object = NULL; 1991 } 1927 1992 } 1928 LIST_INIT(&bp->b_dep); 1993 mtx_unlock(&nblock); 1929 1994 } else { 1930 /* 1995 /* 1931 * If we are defragging then free the buffer. 1996 * We finally have a valid bp. We aren’t quite out of the 1932 */ 1997 * woods, we still have to reserve kva space. In order 1933 if (defrag) { 1998 * to keep fragmentation sane we only allocate kva in 1934 bp->b_flags |= B_INVAL; 1999 * BKVASIZE chunks. 1935 bfreekva(bp); 2000 */ 1936 brelse(bp); 2001 maxsize = (maxsize + BKVAMASK) & ˜BKVAMASK; 1937 defrag = 0; 2002 1938 goto restart; 2003 if (maxsize != bp->b_kvasize) { 1939 } 2004 vm_offset_t addr = 0; 1940 2005 1941 /* 2006 bfreekva(bp); 1942 * If we are overcomitted then recover the buffer and its 2007 1943 * KVM space. This occurs in rare situations when multiple 2008 if (vm_map_findspace(buffer_map, 1944 * processes are blocked in getnewbuf() or allocbuf(). 2009 vm_map_min(buffer_map), maxsize, &addr)) { 1945 */ 2010 /* 1946 if (bufspace >= hibufspace) 2011 * Uh oh. Buffer map is to fragmented. We 1947 flushingbufs = 1; 2012 * must defragment the map. 1948 if (flushingbufs && bp->b_kvasize != 0) { 2013 */ 1949 bp->b_flags |= B_INVAL; 2014 atomic_add_int(&bufdefragcnt, 1); 1950 bfreekva(bp); 2015 defrag = 1; 1951 brelse(bp); 2016 bp->b_flags |= B_INVAL; 1952 goto restart; 2017 brelse(bp); 1953 } 2018 goto restart; 1954 if (bufspace < lobufspace) 2019 } 1955 flushingbufs = 0; 2020 if (addr) { 1956 break; 2021 vm_map_insert(buffer_map, NULL, 0, 1957 } 2022 addr, addr + maxsize, 1958 2023 VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT) 1959 /* ; 1960 * If we exhausted our list, sleep as appropriate. We may have to 2024 1961 * wakeup various daemons and write out some dirty buffers. 2025 bp->b_kvabase = (caddr_t) addr; 1962 * 2026 bp->b_kvasize = maxsize; 1963 * Generally we are sleeping due to insufficient buffer space. 2027 atomic_add_int(&bufspace, bp->b_kvasize); 1964 */ 2028 atomic_add_int(&bufreusecnt, 1); 1965 2029 } 1966 if (bp == NULL) { 2030 } 1967 int flags; 2031 bp->b_saveaddr = bp->b_kvabase; 1968 char *waitmsg; 2032 bp->b_data = bp->b_saveaddr; 1969 2033 } 1970 mtx_unlock(&bqlock); 2034 return(bp); 1971 if (defrag) { 2035 } 1972 flags = VFS_BIO_NEED_BUFSPACE; 2036 1973 waitmsg = "nbufkv"; 2037 /* 1974 } else if (bufspace >= hibufspace) { 2038 * buf_daemon: 1975 waitmsg = "nbufbs"; 2039 * 1976 flags = VFS_BIO_NEED_BUFSPACE; 2040 * buffer flushing daemon. Buffers are normally flushed by the 1977 } else { 2041 * update daemon but if it cannot keep up this process starts to 1978 waitmsg = "newbuf"; 2042 * take the load in an attempt to prevent getnewbuf() from blocking. 
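When getnewbuf() finally reserves KVA it rounds the request up to whole BKVASIZE chunks (maxsize plus BKVAMASK, masked down), which is what keeps buffer_map fragmentation manageable. A tiny standalone illustration of the rounding; the 16K BKVASIZE is an assumed value, the real constants come from <sys/param.h>.

#include <stdio.h>

#define BKVASIZE    16384
#define BKVAMASK    (BKVASIZE - 1)

int
main(void)
{
    long sizes[] = { 512, 8192, 16384, 20000 };
    unsigned int i;

    for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
        printf("maxsize %5ld -> reserve %ld\n", sizes[i],
            (sizes[i] + BKVAMASK) & ~(long)BKVAMASK);
    return (0);
}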
1979 flags = VFS_BIO_NEED_ANY; 2043 */ 1980 } 2044 1981 2045 static struct kproc_desc buf_kp = { 1982 bd_speedup(); /* heeeelp */ 2046 "bufdaemon", 1983 2047 buf_daemon, 1984 mtx_lock(&nblock); 2048 &bufdaemonproc 1985 needsbuffer |= flags; 2049 }; 1986 while (needsbuffer & flags) { 2050 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) 1987 if (msleep(&needsbuffer, &nblock, 2051 1988 (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { 2052 static void 11/15/03 01:28:09 sys/kern/vfs_bio.c 17 2053 buf_daemon() 2117 /* 2054 { 2118 * We couldn’t find any flushable dirty buffers but 2055 int s; 2119 * still have too many dirty buffers, we 2056 2120 * have to sleep and try again. (rare) 2057 mtx_lock(&Giant); 2121 */ 2058 2122 msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); 2059 /* 2123 } 2060 * This process needs to be suspended prior to shutdown sync. 2124 } 2061 */ 2125 } 2062 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc 2126 , 2127 /* 2063 SHUTDOWN_PRI_LAST); 2128 * flushbufqueues: 2064 2129 * 2065 /* 2130 * Try to flush a buffer in the dirty queue. We must be careful to 2066 * This process is allowed to take the buffer cache to the limit 2131 * free up B_INVAL buffers instead of write them, which NFS is 2067 */ 2132 * particularly sensitive to. 2068 s = splbio(); 2133 */ 2069 mtx_lock(&bdlock); 2134 int flushwithdeps = 0; 2070 2135 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 2071 for (;;) { 2136 0, "Number of buffers flushed with dependecies that require rollbacks"); 2072 bd_request = 0; 2137 static int 2073 mtx_unlock(&bdlock); 2138 flushbufqueues(int flushdeps) 2074 2139 { 2075 kthread_suspend_check(bufdaemonproc); 2140 struct thread *td = curthread; 2076 2141 struct vnode *vp; 2077 /* 2142 struct mount *mp; 2078 * Do the flush. Limit the amount of in-transit I/O we 2143 struct buf *bp; 2079 * allow to build up, otherwise we would completely saturate 2144 int hasdeps; 2080 * the I/O system. Wakeup any waiting processes before we 2145 2081 * normally would so they can run in parallel with our drain. 2146 mtx_lock(&bqlock); 2082 */ 2147 TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) { 2083 while (numdirtybuffers > lodirtybuffers) { 2148 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 2084 if (flushbufqueues(0) == 0) { 2149 continue; 2085 /* 2150 KASSERT((bp->b_flags & B_DELWRI), 2086 * Could not find any buffers without rollback 2151 ("unexpected clean buffer %p", bp)); 2087 * dependencies, so just write the first one 2152 VI_LOCK(bp->b_vp); 2088 * in the hopes of eventually making progress. 2153 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { 2089 */ 2154 VI_UNLOCK(bp->b_vp); 2090 flushbufqueues(1); 2155 BUF_UNLOCK(bp); 2091 break; 2156 continue; 2092 } 2157 } 2093 waitrunningbufspace(); 2158 VI_UNLOCK(bp->b_vp); 2094 numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); 2159 if (bp->b_flags & B_INVAL) { 2095 } 2160 bremfreel(bp); 2096 2161 mtx_unlock(&bqlock); 2097 /* 2162 brelse(bp); 2098 * Only clear bd_request if we have reached our low water 2163 return (1); 2099 * mark. The buf_daemon normally waits 1 second and 2164 } 2100 * then incrementally flushes any dirty buffers that have 2165 2101 * built up, within reason. 2166 if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0)) { 2102 * 2167 if (flushdeps == 0) { 2103 * If we were unable to hit our low water mark and couldn’t 2168 BUF_UNLOCK(bp); 2104 * find any flushable buffers, we sleep half a second. 
2169 continue; 2105 * Otherwise we loop immediately. 2170 } 2106 */ 2171 hasdeps = 1; 2107 mtx_lock(&bdlock); 2172 } else 2108 if (numdirtybuffers <= lodirtybuffers) { 2173 hasdeps = 0; 2109 /* 2174 /* 2110 * We reached our low water mark, reset the 2175 * We must hold the lock on a vnode before writing 2111 * request and sleep until we are needed again. 2176 * one of its buffers. Otherwise we may confuse, or 2112 * The sleep is just so the suspend code works. 2177 * in the case of a snapshot vnode, deadlock the 2113 */ 2178 * system. 2114 bd_request = 0; 2179 * 2115 msleep(&bd_request, &bdlock, PVM, "psleep", hz); 2180 * The lock order here is the reverse of the normal 2116 } else { 2181 * of vnode followed by buf lock. This is ok because 11/15/03 01:28:09 sys/kern/vfs_bio.c 18 2182 * the NOWAIT will prevent deadlock. 2247 off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosi 2183 */ ze; 2184 vp = bp->b_vp; 2248 2185 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2249 VM_OBJECT_LOCK(obj); 2186 BUF_UNLOCK(bp); 2250 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 2187 continue; 2251 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); 2188 } 2252 if (!m) 2189 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) { 2253 goto notinmem; 2190 mtx_unlock(&bqlock); 2254 tinc = size; 2191 vfs_bio_awrite(bp); 2255 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) 2192 vn_finished_write(mp); 2256 tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); 2193 VOP_UNLOCK(vp, 0, td); 2257 if (vm_page_is_valid(m, 2194 flushwithdeps += hasdeps; 2258 (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) 2195 return (1); 2259 goto notinmem; 2196 } 2260 } 2197 vn_finished_write(mp); 2261 VM_OBJECT_UNLOCK(obj); 2198 BUF_UNLOCK(bp); 2262 return 1; 2199 } 2263 2200 mtx_unlock(&bqlock); 2264 notinmem: 2201 return (0); 2265 VM_OBJECT_UNLOCK(obj); 2202 } 2266 return (0); 2203 2267 } 2204 /* 2268 2205 * Check to see if a block is currently memory resident. 2269 /* 2206 */ 2270 * vfs_setdirty: 2207 struct buf * 2271 * 2208 incore(struct vnode * vp, daddr_t blkno) 2272 * Sets the dirty range for a buffer based on the status of the dirty 2209 { 2273 * bits in the pages comprising the buffer. 2210 struct buf *bp; 2274 * 2211 2275 * The range is limited to the size of the buffer. 2212 int s = splbio(); 2276 * 2213 VI_LOCK(vp); 2277 * This routine is primarily used by NFS, but is generalized for the 2214 bp = gbincore(vp, blkno); 2278 * B_VMIO case. 2215 VI_UNLOCK(vp); 2279 */ 2216 splx(s); 2280 static void 2217 return (bp); 2281 vfs_setdirty(struct buf *bp) 2218 } 2282 { 2219 2283 int i; 2220 /* 2284 vm_object_t object; 2221 * Returns true if no I/O is needed to access the 2285 2222 * associated VM object. This is like incore except 2286 GIANT_REQUIRED; 2223 * it also hunts around in the VM system for the data. 2287 /* 2224 */ 2288 * Degenerate case - empty buffer 2225 2289 */ 2226 int 2290 2227 inmem(struct vnode * vp, daddr_t blkno) 2291 if (bp->b_bufsize == 0) 2228 { 2292 return; 2229 vm_object_t obj; 2293 2230 vm_offset_t toff, tinc, size; 2294 /* 2231 vm_page_t m; 2295 * We qualify the scan for modified pages on whether the 2232 vm_ooffset_t off; 2296 * object has been flushed yet. The OBJ_WRITEABLE flag 2233 2297 * is not cleared simply by protecting pages off. 
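/*
 * A hedged sketch (not from the listing) of the lock-ordering trick
 * flushbufqueues() uses above: the buf lock is taken first, which is the
 * reverse of the usual vnode-then-buf order, and that is only safe because
 * every acquisition is LK_NOWAIT/V_NOWAIT and the helper backs off on any
 * failure.  try_flush_one() is an illustrative name; the dirty-queue scan
 * and its queue lock are omitted.
 */
static int
try_flush_one(struct buf *bp, struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;

	vp = bp->b_vp;
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
		return (0);			/* buffer busy: skip it */
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		BUF_UNLOCK(bp);
		return (0);			/* filesystem suspended: skip */
	}
	if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0) {
		vn_finished_write(mp);
		BUF_UNLOCK(bp);
		return (0);			/* vnode busy: skip, no deadlock */
	}
	vfs_bio_awrite(bp);			/* async write; consumes the buf lock */
	vn_finished_write(mp);
	VOP_UNLOCK(vp, 0, td);
	return (1);
}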
2234 GIANT_REQUIRED; 2298 */ 2235 ASSERT_VOP_LOCKED(vp, "inmem"); 2299 2236 2300 if ((bp->b_flags & B_VMIO) == 0) 2237 if (incore(vp, blkno)) 2301 return; 2238 return 1; 2302 2239 if (vp->v_mount == NULL) 2303 object = bp->b_pages[0]->object; 2240 return 0; 2304 VM_OBJECT_LOCK(object); 2241 if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_vflag & VV_OBJBUF) == 0) 2305 if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDI 2242 return 0; RTY)) 2243 2306 printf("Warning: object %p writeable but not mightbedirty\n", 2244 size = PAGE_SIZE; object); 2245 if (size > vp->v_mount->mnt_stat.f_iosize) 2307 if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDI 2246 size = vp->v_mount->mnt_stat.f_iosize; RTY)) 11/15/03 01:28:09 sys/kern/vfs_bio.c 19 2308 printf("Warning: object %p mightbedirty but not writeable\n", 2370 * ready for an I/O initiation. B_INVAL may or may not be set on object); 2371 * return. The caller should clear B_INVAL prior to initiating a 2309 2372 * READ. 2310 if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { 2373 * 2311 vm_offset_t boffset; 2374 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 2312 vm_offset_t eoffset; 2375 * an existing buffer. 2313 2376 * 2314 vm_page_lock_queues(); 2377 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 2315 /* 2378 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 2316 * test the pages to see if they have been modified directly 2379 * and then cleared based on the backing VM. If the previous buffer is 2317 * by users through the VM system. 2380 * non-0-sized but invalid, B_CACHE will be cleared. 2318 */ 2381 * 2319 for (i = 0; i < bp->b_npages; i++) { 2382 * If getblk() must create a new buffer, the new buffer is returned with 2320 vm_page_flag_clear(bp->b_pages[i], PG_ZERO); 2383 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 2321 vm_page_test_dirty(bp->b_pages[i]); 2384 * case it is returned with B_INVAL clear and B_CACHE set based on the 2322 } 2385 * backing VM. 2323 2386 * 2324 /* 2387 * getblk() also forces a BUF_WRITE() for any B_DELWRI buffer whos 2325 * Calculate the encompassing dirty range, boffset and eoffset 2388 * B_CACHE bit is clear. , 2389 * 2326 * (eoffset - boffset) bytes. 2390 * What this means, basically, is that the caller should use B_CACHE to 2327 */ 2391 * determine whether the buffer is fully valid or not and should clear 2328 2392 * B_INVAL prior to issuing a read. If the caller intends to validate 2329 for (i = 0; i < bp->b_npages; i++) { 2393 * the buffer by loading its data area with something, the caller needs 2330 if (bp->b_pages[i]->dirty) 2394 * to clear B_INVAL. If the caller does this without issuing an I/O, 2331 break; 2395 * the caller should set B_CACHE ( as an optimization ), else the caller 2332 } 2396 * should issue the I/O and biodone() will set B_CACHE if the I/O was 2333 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 2397 * a write attempt or if it was a successfull read. If the caller 2334 2398 * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR 2335 for (i = bp->b_npages - 1; i >= 0; --i) { 2399 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 
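/*
 * A hedged sketch (not part of the listing) of how a caller uses the
 * B_CACHE/B_INVAL contract spelled out above, in the style of a bread()
 * helper: get the buffer, and only start a read when B_CACHE says the
 * contents are not already valid.  read_meta_block() is an illustrative
 * name; block-number translation (VOP_BMAP) and credential handling are
 * omitted for brevity.
 */
static int
read_meta_block(struct vnode *vp, daddr_t lbn, int size, struct buf **bpp)
{
	struct buf *bp;
	int error;

	*bpp = bp = getblk(vp, lbn, size, 0, 0, 0);
	if (bp == NULL)
		return (EWOULDBLOCK);
	if ((bp->b_flags & B_CACHE) == 0) {
		/* Contents not fully valid: clear B_INVAL and issue the read. */
		bp->b_flags &= ~B_INVAL;
		bp->b_iocmd = BIO_READ;
		bp->b_ioflags &= ~BIO_ERROR;
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);		/* hand off to the backing store */
		error = bufwait(bp);
		if (error != 0) {
			brelse(bp);
			*bpp = NULL;
			return (error);
		}
	}
	return (0);				/* caller releases *bpp when done */
}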
2336 if (bp->b_pages[i]->dirty) { 2400 */ 2337 break; 2401 struct buf * 2338 } 2402 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, 2339 } 2403 int flags) 2340 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK) 2404 { ; 2405 struct buf *bp; 2341 2406 int s; 2342 vm_page_unlock_queues(); 2407 int error; 2343 /* 2408 ASSERT_VOP_LOCKED(vp, "getblk"); 2344 * Fit it to the buffer. 2409 2345 */ 2410 if (size > MAXBSIZE) 2346 2411 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); 2347 if (eoffset > bp->b_bcount) 2412 2348 eoffset = bp->b_bcount; 2413 s = splbio(); 2349 2414 loop: 2350 /* 2415 /* 2351 * If we have a good dirty range, merge with the existing 2416 * Block if we are low on buffers. Certain processes are allowed 2352 * dirty range. 2417 * to completely exhaust the buffer cache. 2353 */ 2418 * 2354 2419 * If this check ever becomes a bottleneck it may be better to 2355 if (boffset < eoffset) { 2420 * move it into the else, when gbincore() fails. At the moment 2356 if (bp->b_dirtyoff > boffset) 2421 * it isn’t a problem. 2357 bp->b_dirtyoff = boffset; 2422 * 2358 if (bp->b_dirtyend < eoffset) 2423 * XXX remove if 0 sections (clean this up after its proven) 2359 bp->b_dirtyend = eoffset; 2424 */ 2360 } 2425 if (numfreebuffers == 0) { 2361 } 2426 if (curthread == PCPU_GET(idlethread)) 2362 VM_OBJECT_UNLOCK(object); 2427 return NULL; 2363 } 2428 mtx_lock(&nblock); 2364 2429 needsbuffer |= VFS_BIO_NEED_ANY; 2365 /* 2430 mtx_unlock(&nblock); 2366 * getblk: 2431 } 2367 * 2432 2368 * Get a block given a specified block and offset into a file/device. 2433 VI_LOCK(vp); 2369 * The buffers B_DONE bit will be cleared on return, making it almost 2434 if ((bp = gbincore(vp, blkno))) { 11/15/03 01:28:09 sys/kern/vfs_bio.c 20 2435 int lockflags; 2500 2436 /* 2501 if (bp->b_bcount != size) 2437 * Buffer is in-core. If the buffer is not busy, it must 2502 allocbuf(bp, size); 2438 * be on a queue. 2503 2439 */ 2504 KASSERT(bp->b_offset != NOOFFSET, 2440 lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; 2505 ("getblk: no buffer offset")); 2441 2506 2442 if (flags & GB_LOCK_NOWAIT) 2507 /* 2443 lockflags |= LK_NOWAIT; 2508 * A buffer with B_DELWRI set and B_CACHE clear must 2444 2509 * be committed before we can return the buffer in 2445 error = BUF_TIMELOCK(bp, lockflags, 2510 * order to prevent the caller from issuing a read 2446 VI_MTX(vp), "getblk", slpflag, slptimeo); 2511 * ( due to B_CACHE not being set ) and overwriting 2447 2512 * it. 2448 /* 2513 * 2449 * If we slept and got the lock we have to restart in case 2514 * Most callers, including NFS and FFS, need this to 2450 * the buffer changed identities. 2515 * operate properly either because they assume they 2451 */ 2516 * can issue a read if B_CACHE is not set, or because 2452 if (error == ENOLCK) 2517 * ( for example ) an uncached B_DELWRI might loop due 2453 goto loop; 2518 * to softupdates re-dirtying the buffer. In the latter 2454 /* We timed out or were interrupted. */ 2519 * case, B_CACHE is set after the first write completes, 2455 else if (error) 2520 * preventing further loops. 2456 return (NULL); 2521 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE 2457 2522 * above while extending the buffer, we cannot allow the 2458 /* 2523 * buffer to remain with B_CACHE set after the write 2459 * The buffer is locked. B_CACHE is cleared if the buffer is 2524 * completes or it will represent a corrupt state. To 2460 * invalid. 
Otherwise, for a non-VMIO buffer, B_CACHE is set 2525 * deal with this we set B_NOCACHE to scrap the buffer 2461 * and for a VMIO buffer B_CACHE is adjusted according to the 2526 * after the write. 2462 * backing VM cache. 2527 * 2463 */ 2528 * We might be able to do something fancy, like setting 2464 if (bp->b_flags & B_INVAL) 2529 * B_CACHE in bwrite() except if B_DELWRI is already set, 2465 bp->b_flags &= ˜B_CACHE; 2530 * so the below call doesn’t set B_CACHE, but that gets real 2466 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) 2531 * confusing. This is much easier. 2467 bp->b_flags |= B_CACHE; 2532 */ 2468 bremfree(bp); 2533 2469 2534 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 2470 /* 2535 bp->b_flags |= B_NOCACHE; 2471 * check for size inconsistancies for non-VMIO case. 2536 BUF_WRITE(bp); 2472 */ 2537 goto loop; 2473 2538 } 2474 if (bp->b_bcount != size) { 2539 2475 if ((bp->b_flags & B_VMIO) == 0 || 2540 splx(s); 2476 (size > bp->b_kvasize)) { 2541 bp->b_flags &= ˜B_DONE; 2477 if (bp->b_flags & B_DELWRI) { 2542 } else { 2478 bp->b_flags |= B_NOCACHE; 2543 int bsize, maxsize, vmio; 2479 BUF_WRITE(bp); 2544 off_t offset; 2480 } else { 2545 2481 if ((bp->b_flags & B_VMIO) && 2546 /* 2482 (LIST_FIRST(&bp->b_dep) == NULL)) { 2547 * Buffer is not in-core, create new buffer. The buffer 2483 bp->b_flags |= B_RELBUF; 2548 * returned by getnewbuf() is locked. Note that the returned 2484 brelse(bp); 2549 * buffer is also considered valid (not marked B_INVAL). 2485 } else { 2550 */ 2486 bp->b_flags |= B_NOCACHE; 2551 VI_UNLOCK(vp); 2487 BUF_WRITE(bp); 2552 /* 2488 } 2553 * If the user does not want us to create the buffer, bail out 2489 } 2554 * here. 2490 goto loop; 2555 */ 2491 } 2556 if (flags & GB_NOCREAT) { 2492 } 2557 splx(s); 2493 2558 return NULL; 2494 /* 2559 } 2495 * If the size is inconsistant in the VMIO case, we can resize 2560 if (vn_isdisk(vp, NULL)) 2496 * the buffer. This might lead to B_CACHE getting set or 2561 bsize = DEV_BSIZE; 2497 * cleared. If the size has not changed, B_CACHE remains 2562 else if (vp->v_mountedhere) 2498 * unchanged from its previous state. 2563 bsize = vp->v_mountedhere->mnt_stat.f_iosize; 2499 */ 2564 else if (vp->v_mount) 11/15/03 01:28:09 sys/kern/vfs_bio.c 21 2565 bsize = vp->v_mount->mnt_stat.f_iosize; 2628 #endif 2566 else 2629 VOP_GETVOBJECT(vp, &bp->b_object); 2567 bsize = size; 2630 } else { 2568 2631 bp->b_flags &= ˜B_VMIO; 2569 offset = blkno * bsize; 2632 bp->b_object = NULL; 2570 vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && 2633 } 2571 (vp->v_vflag & VV_OBJBUF); 2634 2572 maxsize = vmio ? size + (offset & PAGE_MASK) : size; 2635 allocbuf(bp, size); 2573 maxsize = imax(maxsize, bsize); 2636 2574 2637 splx(s); 2575 if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL 2638 bp->b_flags &= ˜B_DONE; ) { 2639 } 2576 if (slpflag || slptimeo) { 2640 KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp)); 2577 splx(s); 2641 return (bp); 2578 return NULL; 2642 } 2579 } 2643 2580 goto loop; 2644 /* 2581 } 2645 * Get an empty, disassociated buffer of given size. The buffer is initially 2582 2646 * set to B_INVAL. 2583 /* 2647 */ 2584 * This code is used to make sure that a buffer is not 2648 struct buf * 2585 * created while the getnewbuf routine is blocked. 2649 geteblk(int size) 2586 * This can be a problem whether the vnode is locked or not. 2650 { 2587 * If the buffer is created out from under us, we have to 2651 struct buf *bp; 2588 * throw away the one we just created. 
There is now window 2652 int s; 2589 * race because we are safely running at splbio() from the 2653 int maxsize; 2590 * point of the duplicate buffer creation through to here, 2654 2591 * and we’ve locked the buffer. 2655 maxsize = (size + BKVAMASK) & ˜BKVAMASK; 2592 * 2656 2593 * Note: this must occur before we associate the buffer 2657 s = splbio(); 2594 * with the vp especially considering limitations in 2658 while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) 2595 * the splay tree implementation when dealing with duplicate 2659 continue; 2596 * lblkno’s. 2660 splx(s); 2597 */ 2661 allocbuf(bp, size); 2598 VI_LOCK(vp); 2662 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 2599 if (gbincore(vp, blkno)) { 2663 KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp)); 2600 VI_UNLOCK(vp); 2664 return (bp); 2601 bp->b_flags |= B_INVAL; 2665 } 2602 brelse(bp); 2666 2603 goto loop; 2667 2604 } 2668 /* 2605 2669 * This code constitutes the buffer memory from either anonymous system 2606 /* 2670 * memory (in the case of non-VMIO operations) or from an associated 2607 * Insert the buffer into the hash, so that it can 2671 * VM object (in the case of VMIO operations). This code is able to 2608 * be found by incore. 2672 * resize a buffer up or down. 2609 */ 2673 * 2610 bp->b_blkno = bp->b_lblkno = blkno; 2674 * Note that this code is tricky, and has many complications to resolve 2611 bp->b_offset = offset; 2675 * deadlock or inconsistant data situations. Tread lightly!!! 2612 2676 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 2613 bgetvp(vp, bp); 2677 * the caller. Calling this code willy nilly can result in the loss of data. 2614 VI_UNLOCK(vp); 2678 * 2615 2679 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 2616 /* 2680 * B_CACHE for the non-VMIO case. 2617 * set B_VMIO bit. allocbuf() the buffer bigger. Since the 2681 */ 2618 * buffer size starts out as 0, B_CACHE will be set by 2682 2619 * allocbuf() for the VMIO case prior to it testing the 2683 int 2620 * backing store for validity. 2684 allocbuf(struct buf *bp, int size) 2621 */ 2685 { 2622 2686 int newbsize, mbsize; 2623 if (vmio) { 2687 int i; 2624 bp->b_flags |= B_VMIO; 2688 2625 #if defined(VFS_BIO_DEBUG) 2689 GIANT_REQUIRED; 2626 if (vp->v_type != VREG) 2690 2627 printf("getblk: vmioing file type %d???\n", vp 2691 if (BUF_REFCNT(bp) == 0) ->v_type); 2692 panic("allocbuf: buffer not busy"); 11/15/03 01:28:09 sys/kern/vfs_bio.c 22 2693 2756 bp->b_flags |= B_MALLOC; 2694 if (bp->b_kvasize < size) 2757 atomic_add_int(&bufmallocspace, mbsize); 2695 panic("allocbuf: buffer too small"); 2758 return 1; 2696 2759 } 2697 if ((bp->b_flags & B_VMIO) == 0) { 2760 origbuf = NULL; 2698 caddr_t origbuf; 2761 origbufsize = 0; 2699 int origbufsize; 2762 /* 2700 /* 2763 * If the buffer is growing on its other-than-first al 2701 * Just get anonymous memory from the kernel. Don’t location, 2702 * mess with B_CACHE. 2764 * then we revert to the page-allocation scheme. 
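/*
 * A small sketch (not from the listing) of the typical use of geteblk()
 * above: the caller gets an anonymous, locked, B_INVAL buffer, uses b_data
 * as wired scratch space, and discards it with brelse().  scratch_copy()
 * is an illustrative name.
 */
static int
scratch_copy(const void *src, int len)
{
	struct buf *bp;

	if (len <= 0 || len > MAXBSIZE)
		return (EINVAL);
	bp = geteblk(len);		/* locked, B_INVAL, b_data valid for len */
	bcopy(src, bp->b_data, len);
	/* ... hand bp->b_data to a consumer that needs wired memory ... */
	brelse(bp);			/* B_INVAL buffers are simply discarded */
	return (0);
}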
2703 */ 2765 */ 2704 mbsize = (size + DEV_BSIZE - 1) & ˜(DEV_BSIZE - 1); 2766 if (bp->b_flags & B_MALLOC) { 2705 if (bp->b_flags & B_MALLOC) 2767 origbuf = bp->b_data; 2706 newbsize = mbsize; 2768 origbufsize = bp->b_bufsize; 2707 else 2769 bp->b_data = bp->b_kvabase; 2708 newbsize = round_page(size); 2770 if (bp->b_bufsize) { 2709 2771 atomic_subtract_int(&bufmallocspace, 2710 if (newbsize < bp->b_bufsize) { 2772 bp->b_bufsize); 2711 /* 2773 bufspacewakeup(); 2712 * malloced buffers are not shrunk 2774 bp->b_bufsize = 0; 2713 */ 2775 } 2714 if (bp->b_flags & B_MALLOC) { 2776 bp->b_flags &= ˜B_MALLOC; 2715 if (newbsize) { 2777 newbsize = round_page(newbsize); 2716 bp->b_bcount = size; 2778 } 2717 } else { 2779 vm_hold_load_pages( 2718 free(bp->b_data, M_BIOBUF); 2780 bp, 2719 if (bp->b_bufsize) { 2781 (vm_offset_t) bp->b_data + bp->b_bufsize, 2720 atomic_subtract_int( 2782 (vm_offset_t) bp->b_data + newbsize); 2721 &bufmallocspace, 2783 if (origbuf) { 2722 bp->b_bufsize); 2784 bcopy(origbuf, bp->b_data, origbufsize); 2723 bufspacewakeup(); 2785 free(origbuf, M_BIOBUF); 2724 bp->b_bufsize = 0; 2786 } 2725 } 2787 } 2726 bp->b_saveaddr = bp->b_kvabase; 2788 } else { 2727 bp->b_data = bp->b_saveaddr; 2789 int desiredpages; 2728 bp->b_bcount = 0; 2790 2729 bp->b_flags &= ˜B_MALLOC; 2791 newbsize = (size + DEV_BSIZE - 1) & ˜(DEV_BSIZE - 1); 2730 } 2792 desiredpages = (size == 0) ? 0 : 2731 return 1; 2793 num_pages((bp->b_offset & PAGE_MASK) + newbsize); 2732 } 2794 2733 vm_hold_free_pages( 2795 if (bp->b_flags & B_MALLOC) 2734 bp, 2796 panic("allocbuf: VMIO buffer can’t be malloced"); 2735 (vm_offset_t) bp->b_data + newbsize, 2797 /* 2736 (vm_offset_t) bp->b_data + bp->b_bufsize); 2798 * Set B_CACHE initially if buffer is 0 length or will become 2737 } else if (newbsize > bp->b_bufsize) { 2799 * 0-length. 2738 /* 2800 */ 2739 * We only use malloced memory on the first allocation 2801 if (size == 0 || bp->b_bufsize == 0) . 2802 bp->b_flags |= B_CACHE; 2740 * and revert to page-allocated memory when the buffer 2803 2741 * grows. 2804 if (newbsize < bp->b_bufsize) { 2742 */ 2805 /* 2743 /* 2806 * DEV_BSIZE aligned new buffer size is less then the 2744 * There is a potential smp race here that could lead 2807 * DEV_BSIZE aligned existing buffer size. Figure out 2745 * to bufmallocspace slightly passing the max. It 2808 * if we have to remove any pages. 2746 * is probably extremely rare and not worth worrying 2809 */ 2747 * over. 2810 if (desiredpages < bp->b_npages) { 2748 */ 2811 vm_page_t m; 2749 if ( (bufmallocspace < maxbufmallocspace) && 2812 2750 (bp->b_bufsize == 0) && 2813 vm_page_lock_queues(); 2751 (mbsize <= PAGE_SIZE/2)) { 2814 for (i = desiredpages; i < bp->b_npages; i++) 2752 { 2753 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK 2815 /* ); 2816 * the page is not freed here -- it 2754 bp->b_bufsize = mbsize; 2817 * is the responsibility of 2755 bp->b_bcount = size; 2818 * vnode_pager_setsize 11/15/03 01:28:09 sys/kern/vfs_bio.c 23 2819 */ 2879 bp->b_flags &= ˜B_CACHE; 2820 m = bp->b_pages[i]; 2880 bp->b_pages[bp->b_npages] = m; 2821 KASSERT(m != bogus_page, 2881 ++bp->b_npages; 2822 ("allocbuf: bogus page found")); 2882 } 2823 while (vm_page_sleep_if_busy(m, TRUE, 2883 continue; "biodep")) 2884 } 2824 vm_page_lock_queues(); 2885 2825 2886 /* 2826 bp->b_pages[i] = NULL; 2887 * We found a page. 
If we have to sleep on it 2827 vm_page_unwire(m, 0); , 2828 } 2888 * retry because it might have gotten freed ou 2829 vm_page_unlock_queues(); t 2830 pmap_qremove((vm_offset_t) trunc_page((vm_offs 2889 * from under us. et_t)bp->b_data) + 2890 * 2831 (desiredpages << PAGE_SHIFT), (bp->b_npage 2891 * We can only test PG_BUSY here. Blocking on s - desiredpages)); 2892 * m->busy might lead to a deadlock: 2832 bp->b_npages = desiredpages; 2893 * 2833 } 2894 * vm_fault->getpages->cluster_read->allocbuf 2834 } else if (size > bp->b_bcount) { 2895 * 2835 /* 2896 */ 2836 * We are growing the buffer, possibly in a 2897 vm_page_lock_queues(); 2837 * byte-granular fashion. 2898 if (vm_page_sleep_if_busy(m, FALSE, "pgtblk")) 2838 */ 2899 continue; 2839 struct vnode *vp; 2900 2840 vm_object_t obj; 2901 /* 2841 vm_offset_t toff; 2902 * We have a good page. Should we wakeup the 2842 vm_offset_t tinc; 2903 * page daemon? 2843 2904 */ 2844 /* 2905 if ((curproc != pageproc) && 2845 * Step 1, bring in the VM pages from the object, 2906 ((m->queue - m->pc) == PQ_CACHE) && 2846 * allocating them if necessary. We must clear 2907 ((cnt.v_free_count + cnt.v_cache_count) < 2847 * B_CACHE if these pages are not valid for the 2908 (cnt.v_free_min + cnt.v_cache_min))) { 2848 * range covered by the buffer. 2909 pagedaemon_wakeup(); 2849 */ 2910 } 2850 2911 vm_page_flag_clear(m, PG_ZERO); 2851 vp = bp->b_vp; 2912 vm_page_wire(m); 2852 obj = bp->b_object; 2913 vm_page_unlock_queues(); 2853 2914 bp->b_pages[bp->b_npages] = m; 2854 VM_OBJECT_LOCK(obj); 2915 ++bp->b_npages; 2855 while (bp->b_npages < desiredpages) { 2916 } 2856 vm_page_t m; 2917 2857 vm_pindex_t pi; 2918 /* 2858 2919 * Step 2. We’ve loaded the pages into the buffer, 2859 pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; 2920 * we have to figure out if we can still have B_CACHE 2860 if ((m = vm_page_lookup(obj, pi)) == NULL) { 2921 * set. Note that B_CACHE is set according to the 2861 /* 2922 * byte-granular range ( bcount and size ), new the 2862 * note: must allocate system pages 2923 * aligned range ( newbsize ). 2863 * since blocking here could intefere 2924 * 2864 * with paging I/O, no matter which 2925 * The VM test is against m->valid, which is DEV_BSIZE 2865 * process we are. 2926 * aligned. Needless to say, the validity of the data 2866 */ 2927 * needs to also be DEV_BSIZE aligned. Note that this 2867 m = vm_page_alloc(obj, pi, 2928 * fails with NFS if the server or some other client 2868 VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 2929 * extends the file’s EOF. If our buffer is resized, 2869 if (m == NULL) { 2930 * B_CACHE may remain set! 
XXX 2870 atomic_add_int(&vm_pageout_def 2931 */ icit, 2932 2871 desiredpages - bp->b_npage 2933 toff = bp->b_bcount; s); 2934 tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK) 2872 VM_OBJECT_UNLOCK(obj); ; 2873 VM_WAIT; 2935 2874 VM_OBJECT_LOCK(obj); 2936 while ((bp->b_flags & B_CACHE) && toff < size) { 2875 } else { 2937 vm_pindex_t pi; 2876 vm_page_lock_queues(); 2938 2877 vm_page_wakeup(m); 2939 if (tinc > (size - toff)) 2878 vm_page_unlock_queues(); 2940 tinc = size - toff; 11/15/03 01:28:09 sys/kern/vfs_bio.c 24 2941 3006 msleep(bp, &bdonelock, PRIBIO, wchan, hz / 10); 2942 pi = ((bp->b_offset & PAGE_MASK) + toff) >> 3007 mtx_unlock(&bdonelock); 2943 PAGE_SHIFT; 3008 if (bp->bio_error != 0) 2944 3009 return (bp->bio_error); 2945 vfs_buf_test_cache( 3010 if (!(bp->bio_flags & BIO_ERROR)) 2946 bp, 3011 return (0); 2947 bp->b_offset, 3012 return (EIO); 2948 toff, 3013 } 2949 tinc, 3014 2950 bp->b_pages[pi] 3015 void 2951 ); 3016 biofinish(struct bio *bp, struct devstat *stat, int error) 2952 toff += tinc; 3017 { 2953 tinc = PAGE_SIZE; 3018 2954 } 3019 if (error) { 2955 VM_OBJECT_UNLOCK(obj); 3020 bp->bio_error = error; 2956 3021 bp->bio_flags |= BIO_ERROR; 2957 /* 3022 } 2958 * Step 3, fixup the KVM pmap. Remember that 3023 if (stat != NULL) 2959 * bp->b_data is relative to bp->b_offset, but 3024 devstat_end_transaction_bio(stat, bp); 2960 * bp->b_offset may be offset into the first page. 3025 biodone(bp); 2961 */ 3026 } 2962 3027 2963 bp->b_data = (caddr_t) 3028 /* 2964 trunc_page((vm_offset_t)bp->b_data); 3029 * bufwait: 2965 pmap_qenter( 3030 * 2966 (vm_offset_t)bp->b_data, 3031 * Wait for buffer I/O completion, returning error status. The buffer 2967 bp->b_pages, 3032 * is left locked and B_DONE on return. B_EINTR is converted into an EIN 2968 bp->b_npages TR 2969 ); 3033 * error and cleared. 2970 3034 */ 2971 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 3035 int 2972 (vm_offset_t)(bp->b_offset & PAGE_MASK)); 3036 bufwait(register struct buf * bp) 2973 } 3037 { 2974 } 3038 int s; 2975 if (newbsize < bp->b_bufsize) 3039 2976 bufspacewakeup(); 3040 s = splbio(); 2977 bp->b_bufsize = newbsize; /* actual buffer allocation */ 3041 if (bp->b_iocmd == BIO_READ) 2978 bp->b_bcount = size; /* requested buffer size */ 3042 bwait(bp, PRIBIO, "biord"); 2979 return 1; 3043 else 2980 } 3044 bwait(bp, PRIBIO, "biowr"); 2981 3045 splx(s); 2982 void 3046 if (bp->b_flags & B_EINTR) { 2983 biodone(struct bio *bp) 3047 bp->b_flags &= ˜B_EINTR; 2984 { 3048 return (EINTR); 2985 mtx_lock(&bdonelock); 3049 } 2986 bp->bio_flags |= BIO_DONE; 3050 if (bp->b_ioflags & BIO_ERROR) { 2987 if (bp->bio_done == NULL) 3051 return (bp->b_error ? bp->b_error : EIO); 2988 wakeup(bp); 3052 } else { 2989 mtx_unlock(&bdonelock); 3053 return (0); 2990 if (bp->bio_done != NULL) 3054 } 2991 bp->bio_done(bp); 3055 } 2992 } 3056 2993 3057 /* 2994 /* 3058 * Call back function from struct bio back up to struct buf. 2995 * Wait for a BIO to finish. 3059 * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY(). 2996 * 3060 */ 2997 * XXX: resort to a timeout for now. The optimal locking (if any) for this 3061 static void 2998 * case is not yet clear. 3062 bufdonebio(struct bio *bp) 2999 */ 3063 { 3000 int 3064 3001 biowait(struct bio *bp, const char *wchan) 3065 /* Device drivers may or may not hold giant, hold it here. 
*/ 3002 { 3066 mtx_lock(&Giant); 3003 3067 bufdone(bp->bio_caller2); 3004 mtx_lock(&bdonelock); 3068 mtx_unlock(&Giant); 3005 while ((bp->bio_flags & BIO_DONE) == 0) 3069 } 11/15/03 01:28:09 sys/kern/vfs_bio.c 25 3070 3134 } 3071 void 3135 if (LIST_FIRST(&bp->b_dep) != NULL) 3072 dev_strategy(struct buf *bp) 3136 buf_complete(bp); 3073 { 3137 3074 3138 if (bp->b_flags & B_VMIO) { 3075 if ((!bp->b_iocmd) || (bp->b_iocmd & (bp->b_iocmd - 1))) 3139 int i; 3076 panic("b_iocmd botch"); 3140 vm_ooffset_t foff; 3077 bp->b_io.bio_done = bufdonebio; 3141 vm_page_t m; 3078 bp->b_io.bio_caller2 = bp; 3142 vm_object_t obj; 3079 (*devsw(bp->b_io.bio_dev)->d_strategy)(&bp->b_io); 3143 int iosize; 3080 } 3144 struct vnode *vp = bp->b_vp; 3081 3145 3082 /* 3146 obj = bp->b_object; 3083 * bufdone: 3147 3084 * 3148 #if defined(VFS_BIO_DEBUG) 3085 * Finish I/O on a buffer, optionally calling a completion function. 3149 mp_fixme("usecount and vflag accessed without locks."); 3086 * This is usually called from an interrupt so process blocking is 3150 if (vp->v_usecount == 0) { 3087 * not allowed. 3151 panic("biodone: zero vnode ref count"); 3088 * 3152 } 3089 * biodone is also responsible for setting B_CACHE in a B_VMIO bp. 3153 3090 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 3154 if ((vp->v_vflag & VV_OBJBUF) == 0) { 3091 * assuming B_INVAL is clear. 3155 panic("biodone: vnode is not setup for merged cache"); 3092 * 3156 } 3093 * For the VMIO case, we set B_CACHE if the op was a read and no 3157 #endif 3094 * read error occured, or if the op was a write. B_CACHE is never 3158 3095 * set if the buffer is invalid or otherwise uncacheable. 3159 foff = bp->b_offset; 3096 * 3160 KASSERT(bp->b_offset != NOOFFSET, 3097 * biodone does not mess with B_INVAL, allowing the I/O routine or the 3161 ("biodone: no buffer offset")); 3098 * initiator to leave B_INVAL set to brelse the buffer out of existance 3162 3099 * in the biodone routine. 3163 VM_OBJECT_LOCK(obj); 3100 */ 3164 #if defined(VFS_BIO_DEBUG) 3101 void 3165 if (obj->paging_in_progress < bp->b_npages) { 3102 bufdone(struct buf *bp) 3166 printf("biodone: paging in progress(%d) < bp->b_npages 3103 { (%d)\n", 3104 int s; 3167 obj->paging_in_progress, bp->b_npages); 3105 void (*biodone)(struct buf *); 3168 } 3106 3169 #endif 3107 GIANT_REQUIRED; 3170 3108 3171 /* 3109 s = splbio(); 3172 * Set B_CACHE if the op was a normal read and no error 3110 3173 * occured. B_CACHE is set for writes in the b*write() 3111 KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REF 3174 * routines. 
CNT(bp))); 3175 */ 3112 KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); 3176 iosize = bp->b_bcount - bp->b_resid; 3113 3177 if (bp->b_iocmd == BIO_READ && 3114 bp->b_flags |= B_DONE; 3178 !(bp->b_flags & (B_INVAL|B_NOCACHE)) && 3115 runningbufwakeup(bp); 3179 !(bp->b_ioflags & BIO_ERROR)) { 3116 3180 bp->b_flags |= B_CACHE; 3117 if (bp->b_iocmd == BIO_DELETE) { 3181 } 3118 brelse(bp); 3182 vm_page_lock_queues(); 3119 splx(s); 3183 for (i = 0; i < bp->b_npages; i++) { 3120 return; 3184 int bogusflag = 0; 3121 } 3185 int resid; 3122 3186 3123 if (bp->b_iocmd == BIO_WRITE) { 3187 resid = ((foff + PAGE_SIZE) & ˜(off_t)PAGE_MASK) - fof 3124 vwakeup(bp); f; 3125 } 3188 if (resid > iosize) 3126 3189 resid = iosize; 3127 /* call optional completion function if requested */ 3190 3128 if (bp->b_iodone != NULL) { 3191 /* 3129 biodone = bp->b_iodone; 3192 * cleanup bogus pages, restoring the originals 3130 bp->b_iodone = NULL; 3193 */ 3131 (*biodone) (bp); 3194 m = bp->b_pages[i]; 3132 splx(s); 3195 if (m == bogus_page) { 3133 return; 3196 bogusflag = 1; 11/15/03 01:28:09 sys/kern/vfs_bio.c 26 3197 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 3253 VM_OBJECT_UNLOCK(obj); 3198 if (m == NULL) 3254 } 3199 panic("biodone: page disappeared!"); 3255 3200 bp->b_pages[i] = m; 3256 /* 3201 pmap_qenter(trunc_page((vm_offset_t)bp->b_data 3257 * For asynchronous completions, release the buffer now. The brelse ), bp->b_pages, bp->b_npages); 3258 * will do a wakeup there if necessary - so no need to do a wakeup 3202 } 3259 * here in the async case. The sync case always needs to do a wakeup. 3203 #if defined(VFS_BIO_DEBUG) 3260 */ 3204 if (OFF_TO_IDX(foff) != m->pindex) { 3261 3205 printf( 3262 if (bp->b_flags & B_ASYNC) { 3206 "biodone: foff(%jd)/m->pindex(%ju) mismatch\n", 3263 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b 3207 (intmax_t)foff, (uintmax_t)m->pindex); _ioflags & BIO_ERROR)) 3208 } 3264 brelse(bp); 3209 #endif 3265 else 3210 3266 bqrelse(bp); 3211 /* 3267 } else { 3212 * In the write case, the valid and clean bits are 3268 bdone(bp); 3213 * already changed correctly ( see bdwrite() ), so we 3269 } 3214 * only need to do this here in the read case. 3270 splx(s); 3215 */ 3271 } 3216 if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 3272 0) { 3273 /* 3217 vfs_page_set_valid(bp, foff, i, m); 3274 * This routine is called in lieu of iodone in the case of 3218 } 3275 * incomplete I/O. This keeps the busy status for pages 3219 vm_page_flag_clear(m, PG_ZERO); 3276 * consistant. 3220 3277 */ 3221 /* 3278 void 3222 * when debugging new filesystems or buffer I/O method 3279 vfs_unbusy_pages(struct buf * bp) s, this 3280 { 3223 * is the most common error that pops up. if you see 3281 int i; this, you 3282 3224 * have not set the page busy flag correctly!!! 
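/*
 * A hedged sketch (not from the listing) of the synchronous completion
 * handshake implemented by biowait()/biodone() above: the submitter leaves
 * bio_done NULL so biodone() issues the wakeup on the bio itself, then
 * sleeps in biowait().  read_one_sector() is illustrative; how bios must be
 * allocated and which fields a particular driver consumes varies, so the
 * stack allocation and field setup here are assumptions, not a recipe.
 */
static int
read_one_sector(dev_t dev, void *data, off_t offset)
{
	struct bio bio;

	bzero(&bio, sizeof(bio));
	bio.bio_cmd = BIO_READ;
	bio.bio_dev = dev;
	bio.bio_offset = offset;
	bio.bio_bcount = DEV_BSIZE;
	bio.bio_length = DEV_BSIZE;
	bio.bio_data = (caddr_t)data;
	bio.bio_done = NULL;			/* biodone() will wakeup(&bio) */

	(*devsw(dev)->d_strategy)(&bio);	/* same dispatch as dev_strategy() */
	return (biowait(&bio, "rd1sec"));
}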
3283 runningbufwakeup(bp); 3225 */ 3284 if (bp->b_flags & B_VMIO) { 3226 if (m->busy == 0) { 3285 vm_object_t obj; 3227 printf("biodone: page busy < 0, " 3286 3228 "pindex: %d, foff: 0x(%x,%x), " 3287 obj = bp->b_object; 3229 "resid: %d, index: %d\n", 3288 VM_OBJECT_LOCK(obj); 3230 (int) m->pindex, (int)(foff >> 32), 3289 vm_page_lock_queues(); 3231 (int) foff & 0xffffffff, resid 3290 for (i = 0; i < bp->b_npages; i++) { , i); 3291 vm_page_t m = bp->b_pages[i]; 3232 if (!vn_isdisk(vp, NULL)) 3292 3233 printf(" iosize: %jd, lblkno: %jd, fla 3293 if (m == bogus_page) { gs: 0x%x, npages: %d\n", 3294 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offse 3234 (intmax_t)bp->b_vp->v_mount->mnt_s t) + i); tat.f_iosize, 3295 if (!m) { 3235 (intmax_t) bp->b_lblkno, 3296 panic("vfs_unbusy_pages: page missing\ 3236 bp->b_flags, bp->b_npages); n"); 3237 else 3297 } 3238 printf(" VDEV, lblkno: %jd, flags: 0x% 3298 bp->b_pages[i] = m; x, npages: %d\n", 3299 pmap_qenter(trunc_page((vm_offset_t)bp->b_data 3239 (intmax_t) bp->b_lblkno, ), bp->b_pages, bp->b_npages); 3240 bp->b_flags, bp->b_npages); 3300 } 3241 printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d 3301 vm_object_pip_subtract(obj, 1); \n", 3302 vm_page_flag_clear(m, PG_ZERO); 3242 (u_long)m->valid, (u_long)m->dirty, 3303 vm_page_io_finish(m); 3243 m->wire_count); 3304 } 3244 panic("biodone: page busy < 0\n"); 3305 vm_page_unlock_queues(); 3245 } 3306 vm_object_pip_wakeupn(obj, 0); 3246 vm_page_io_finish(m); 3307 VM_OBJECT_UNLOCK(obj); 3247 vm_object_pip_subtract(obj, 1); 3308 } 3248 foff = (foff + PAGE_SIZE) & ˜(off_t)PAGE_MASK; 3309 } 3249 iosize -= resid; 3310 3250 } 3311 /* 3251 vm_page_unlock_queues(); 3312 * vfs_page_set_valid: 3252 vm_object_pip_wakeupn(obj, 0); 3313 * 11/15/03 01:28:09 sys/kern/vfs_bio.c 27 3314 * Set the valid bits in a page based on the supplied offset. The 3379 vm_page_t m = bp->b_pages[i]; 3315 * range is restricted to the buffer’s size. 3380 3316 * 3381 if (vm_page_sleep_if_busy(m, FALSE, "vbpage")) 3317 * This routine is typically called after a read completes. 3382 goto retry; 3318 */ 3383 } 3319 static void 3384 bogus = 0; 3320 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) 3385 for (i = 0; i < bp->b_npages; i++) { 3321 { 3386 vm_page_t m = bp->b_pages[i]; 3322 vm_ooffset_t soff, eoff; 3387 3323 3388 vm_page_flag_clear(m, PG_ZERO); 3324 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3389 if ((bp->b_flags & B_CLUSTER) == 0) { 3325 /* 3390 vm_object_pip_add(obj, 1); 3326 * Start and end offsets in buffer. eoff - soff may not cross a 3391 vm_page_io_start(m); 3327 * page boundry or cross the end of the buffer. The end of the 3392 } 3328 * buffer, in this case, is our file EOF, not the allocation size 3393 /* 3329 * of the buffer. 3394 * When readying a buffer for a read ( i.e 3330 */ 3395 * clear_modify == 0 ), it is important to do 3331 soff = off; 3396 * bogus_page replacement for valid pages in 3332 eoff = (off + PAGE_SIZE) & ˜(off_t)PAGE_MASK; 3397 * partially instantiated buffers. Partially 3333 if (eoff > bp->b_offset + bp->b_bcount) 3398 * instantiated buffers can, in turn, occur when 3334 eoff = bp->b_offset + bp->b_bcount; 3399 * reconstituting a buffer from its VM backing store 3335 3400 * base. We only have to do this if B_CACHE is 3336 /* 3401 * clear ( which causes the I/O to occur in the 3337 * Set valid range. This is typically the entire buffer and thus the 3402 * first place ). The replacement prevents the read 3338 * entire page. 
3403 * I/O from overwriting potentially dirty VM-backed 3339 */ 3404 * pages. XXX bogus page replacement is, uh, bogus. 3340 if (eoff > soff) { 3405 * It may not work properly with small-block devices. 3341 vm_page_set_validclean( 3406 * We need to find a better way. 3342 m, 3407 */ 3343 (vm_offset_t) (soff & PAGE_MASK), 3408 pmap_remove_all(m); 3344 (vm_offset_t) (eoff - soff) 3409 if (clear_modify) 3345 ); 3410 vfs_page_set_valid(bp, foff, i, m); 3346 } 3411 else if (m->valid == VM_PAGE_BITS_ALL && 3347 } 3412 (bp->b_flags & B_CACHE) == 0) { 3348 3413 bp->b_pages[i] = bogus_page; 3349 /* 3414 bogus++; 3350 * This routine is called before a device strategy routine. 3415 } 3351 * It is used to tell the VM system that paging I/O is in 3416 foff = (foff + PAGE_SIZE) & ˜(off_t)PAGE_MASK; 3352 * progress, and treat the pages associated with the buffer 3417 } 3353 * almost as being PG_BUSY. Also the object paging_in_progress 3418 vm_page_unlock_queues(); 3354 * flag is handled to make sure that the object doesn’t become 3419 VM_OBJECT_UNLOCK(obj); 3355 * inconsistant. 3420 if (bogus) 3356 * 3421 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b 3357 * Since I/O has not been initiated yet, certain buffer flags _pages, bp->b_npages); 3358 * such as BIO_ERROR or B_INVAL may be in an inconsistant state 3422 } 3359 * and should be ignored. 3423 } 3360 */ 3424 3361 void 3425 /* 3362 vfs_busy_pages(struct buf * bp, int clear_modify) 3426 * Tell the VM system that the pages associated with this buffer 3363 { 3427 * are clean. This is used for delayed writes where the data is 3364 int i, bogus; 3428 * going to go to disk eventually without additional VM intevention. 3365 3429 * 3366 if (bp->b_flags & B_VMIO) { 3430 * Note that while we only really need to clean through to b_bcount, we 3367 vm_object_t obj; 3431 * just go ahead and clean through to b_bufsize. 3368 vm_ooffset_t foff; 3432 */ 3369 3433 static void 3370 obj = bp->b_object; 3434 vfs_clean_pages(struct buf * bp) 3371 foff = bp->b_offset; 3435 { 3372 KASSERT(bp->b_offset != NOOFFSET, 3436 int i; 3373 ("vfs_busy_pages: no buffer offset")); 3437 3374 vfs_setdirty(bp); 3438 if (bp->b_flags & B_VMIO) { 3375 VM_OBJECT_LOCK(obj); 3439 vm_ooffset_t foff; 3376 retry: 3440 3377 vm_page_lock_queues(); 3441 foff = bp->b_offset; 3378 for (i = 0; i < bp->b_npages; i++) { 3442 KASSERT(bp->b_offset != NOOFFSET, 11/15/03 01:28:09 sys/kern/vfs_bio.c 28 3443 ("vfs_clean_pages: no buffer offset")); 3505 /* 3444 VM_OBJECT_LOCK(bp->b_object); 3506 * vfs_bio_clrbuf: 3445 vm_page_lock_queues(); 3507 * 3446 for (i = 0; i < bp->b_npages; i++) { 3508 * clear a buffer. This routine essentially fakes an I/O, so we need 3447 vm_page_t m = bp->b_pages[i]; 3509 * to clear BIO_ERROR and B_INVAL. 3448 vm_ooffset_t noff = (foff + PAGE_SIZE) & ˜(off_t)PAGE_ 3510 * MASK; 3511 * Note that while we only theoretically need to clear through b_bcount, 3449 vm_ooffset_t eoff = noff; 3512 * we go ahead and clear through b_bufsize. 
3450 3513 */ 3451 if (eoff > bp->b_offset + bp->b_bufsize) 3514 3452 eoff = bp->b_offset + bp->b_bufsize; 3515 void 3453 vfs_page_set_valid(bp, foff, i, m); 3516 vfs_bio_clrbuf(struct buf *bp) 3454 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - fof 3517 { f); */ 3518 int i, mask = 0; 3455 foff = noff; 3519 caddr_t sa, ea; 3456 } 3520 3457 vm_page_unlock_queues(); 3521 GIANT_REQUIRED; 3458 VM_OBJECT_UNLOCK(bp->b_object); 3522 3459 } 3523 if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { 3460 } 3524 bp->b_flags &= ˜B_INVAL; 3461 3525 bp->b_ioflags &= ˜BIO_ERROR; 3462 /* 3526 VM_OBJECT_LOCK(bp->b_object); 3463 * vfs_bio_set_validclean: 3527 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 3464 * 3528 (bp->b_offset & PAGE_MASK) == 0) { 3465 * Set the range within the buffer to valid and clean. The range is 3529 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 3466 * relative to the beginning of the buffer, b_offset. Note that b_offset 3530 if (bp->b_pages[0] != bogus_page) 3467 * itself may be offset from the beginning of the first page. 3531 VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, 3468 * MA_OWNED); 3469 */ 3532 if ((bp->b_pages[0]->valid & mask) == mask) 3470 3533 goto unlock; 3471 void 3534 if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && 3472 vfs_bio_set_validclean(struct buf *bp, int base, int size) 3535 ((bp->b_pages[0]->valid & mask) == 0)) { 3473 { 3536 bzero(bp->b_data, bp->b_bufsize); 3474 if (bp->b_flags & B_VMIO) { 3537 bp->b_pages[0]->valid |= mask; 3475 int i; 3538 goto unlock; 3476 int n; 3539 } 3477 3540 } 3478 /* 3541 ea = sa = bp->b_data; 3479 * Fixup base to be relative to beginning of first page. 3542 for(i=0;ib_npages;i++,sa=ea) { 3480 * Set initial n to be the maximum number of bytes in the 3543 int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; 3481 * first page that can be validated. 
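/*
 * A small sketch (not from the listing) of the arithmetic used by
 * vfs_bio_set_validclean() above: a byte range relative to b_offset is
 * visited one page at a time, with the first chunk shortened by the offset
 * of the range within its page.  walk_buf_range() and the visit callback
 * are illustrative names.
 */
static void
walk_buf_range(vm_ooffset_t b_offset, int base, int size,
    void (*visit)(int pageidx, int pageoff, int len))
{
	int i, n;

	base += (b_offset & PAGE_MASK);		/* make base page-relative */
	n = PAGE_SIZE - (base & PAGE_MASK);	/* bytes left in the first page */
	for (i = base / PAGE_SIZE; size > 0; ++i) {
		if (n > size)
			n = size;
		(*visit)(i, base & PAGE_MASK, n);
		base += n;
		size -= n;
		n = PAGE_SIZE;
	}
}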
3544 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); 3482 */ 3545 ea = (caddr_t)(vm_offset_t)ulmin( 3483 3546 (u_long)(vm_offset_t)ea, 3484 base += (bp->b_offset & PAGE_MASK); 3547 (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); 3485 n = PAGE_SIZE - (base & PAGE_MASK); 3548 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 3486 3549 if (bp->b_pages[i] != bogus_page) 3487 VM_OBJECT_LOCK(bp->b_object); 3550 VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, 3488 vm_page_lock_queues(); MA_OWNED); 3489 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) 3551 if ((bp->b_pages[i]->valid & mask) == mask) { 3552 continue; 3490 vm_page_t m = bp->b_pages[i]; 3553 if ((bp->b_pages[i]->valid & mask) == 0) { 3491 3554 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { 3492 if (n > size) 3555 bzero(sa, ea - sa); 3493 n = size; 3556 } 3494 3557 } else { 3495 vm_page_set_validclean(m, base & PAGE_MASK, n); 3558 for (; sa < ea; sa += DEV_BSIZE, j++) { 3496 base += n; 3559 if (((bp->b_pages[i]->flags & PG_ZERO) 3497 size -= n; == 0) && 3498 n = PAGE_SIZE; 3560 (bp->b_pages[i]->valid & (1<b_object); 3562 } 3502 } 3563 } 3503 } 3564 bp->b_pages[i]->valid |= mask; 3504 3565 vm_page_lock_queues(); 11/15/03 01:28:09 sys/kern/vfs_bio.c 29 3566 vm_page_flag_clear(bp->b_pages[i], PG_ZERO); 3631 GIANT_REQUIRED; 3567 vm_page_unlock_queues(); 3632 3568 } 3633 from = round_page(from); 3569 unlock: 3634 to = round_page(to); 3570 VM_OBJECT_UNLOCK(bp->b_object); 3635 newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PA 3571 bp->b_resid = 0; GE_SHIFT; 3572 } else { 3636 3573 clrbuf(bp); 3637 VM_OBJECT_LOCK(kernel_object); 3574 } 3638 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 3575 } 3639 p = bp->b_pages[index]; 3576 3640 if (p && (index < bp->b_npages)) { 3577 /* 3641 if (p->busy) { 3578 * vm_hold_load_pages and vm_hold_free_pages get pages into 3642 printf( 3579 * a buffers address space. The pages are anonymous and are 3643 "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", 3580 * not associated with a file object. 3644 (intmax_t)bp->b_blkno, 3581 */ 3645 (intmax_t)bp->b_lblkno); 3582 static void 3646 } 3583 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) 3647 bp->b_pages[index] = NULL; 3584 { 3648 pmap_qremove(pg, 1); 3585 vm_offset_t pg; 3649 vm_page_lock_queues(); 3586 vm_page_t p; 3650 vm_page_busy(p); 3587 int index; 3651 vm_page_unwire(p, 0); 3588 3652 vm_page_free(p); 3589 to = round_page(to); 3653 vm_page_unlock_queues(); 3590 from = round_page(from); 3654 } 3591 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 3655 } 3592 3656 VM_OBJECT_UNLOCK(kernel_object); 3593 VM_OBJECT_LOCK(kernel_object); 3657 bp->b_npages = newnpages; 3594 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 3658 } 3595 tryagain: 3659 3596 /* 3660 /* 3597 * note: must allocate system pages since blocking here 3661 * Map an IO request into kernel virtual address space. 3598 * could intefere with paging I/O, no matter which 3662 * 3599 * process we are. 3663 * All requests are (re)mapped into kernel VA space. 3600 */ 3664 * Notice that we use b_bufsize for the size of the buffer 3601 p = vm_page_alloc(kernel_object, 3665 * to be mapped. b_bcount might be modified by the driver. 
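/*
 * A hedged sketch (not from the listing) of the vmapbuf()/vunmapbuf()
 * discipline required by the comment above: the caller points b_data at the
 * user buffer, sets b_bufsize and b_iocmd (the mapping protection depends on
 * it), MUST check vmapbuf()'s return value, and unmaps after the transfer.
 * do_user_xfer() and its callback are illustrative names.
 */
static int
do_user_xfer(struct buf *bp, int (*xfer)(struct buf *))
{
	int error;

	if (vmapbuf(bp) < 0)		/* may fail even for a "valid" address */
		return (EFAULT);
	error = (*xfer)(bp);		/* b_data now points at wired kernel VA */
	vunmapbuf(bp);			/* unholds the pages, restores b_data */
	return (error);
}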
3602 ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), 3666 * 3603 VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 3667 * Note that even if the caller determines that the address space should 3604 if (!p) { 3668 * be valid, a race or a smaller-file mapped into a larger space may 3605 atomic_add_int(&vm_pageout_deficit, 3669 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST 3606 (to - pg) >> PAGE_SHIFT); 3670 * check the return value. 3607 VM_OBJECT_UNLOCK(kernel_object); 3671 */ 3608 VM_WAIT; 3672 int 3609 VM_OBJECT_LOCK(kernel_object); 3673 vmapbuf(struct buf *bp) 3610 goto tryagain; 3674 { 3611 } 3675 caddr_t addr, kva; 3612 p->valid = VM_PAGE_BITS_ALL; 3676 vm_prot_t prot; 3613 pmap_qenter(pg, &p, 1); 3677 int pidx, i; 3614 bp->b_pages[index] = p; 3678 struct vm_page *m; 3615 vm_page_lock_queues(); 3679 struct pmap *pmap = &curproc->p_vmspace->vm_pmap; 3616 vm_page_wakeup(p); 3680 3617 vm_page_unlock_queues(); 3681 GIANT_REQUIRED; 3618 } 3682 3619 VM_OBJECT_UNLOCK(kernel_object); 3683 if (bp->b_bufsize < 0) 3620 bp->b_npages = index; 3684 return (-1); 3621 } 3685 prot = (bp->b_iocmd == BIO_READ) ? VM_PROT_READ | VM_PROT_WRITE : 3622 3686 VM_PROT_READ; 3623 /* Return pages associated with this buf to the vm system */ 3687 for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0; 3624 static void 3688 addr < bp->b_data + bp->b_bufsize; 3625 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) 3689 addr += PAGE_SIZE, pidx++) { 3626 { 3690 /* 3627 vm_offset_t pg; 3691 * Do the vm_fault if needed; do the copy-on-write thing 3628 vm_page_t p; 3692 * when reading stuff off device into memory. 3629 int index, newnpages; 3693 * 3630 3694 * NOTE! Must use pmap_extract() because addr may be in 11/15/03 01:28:09 sys/kern/vfs_bio.c 30 3695 * the userland address space, and kextract is only guarenteed 3760 mtx_lock(&bdonelock); 3696 * to work for the kernland address space (see: sparc64 port). 3761 while ((bp->b_flags & B_DONE) == 0) 3697 */ 3762 msleep(bp, &bdonelock, pri, wchan, 0); 3698 retry: 3763 mtx_unlock(&bdonelock); 3699 if (vm_fault_quick(addr >= bp->b_data ? 
addr : bp->b_data, 3764 } 3700 prot) < 0) { 3765 3701 vm_page_lock_queues(); 3766 #include "opt_ddb.h" 3702 for (i = 0; i < pidx; ++i) { 3767 #ifdef DDB 3703 vm_page_unhold(bp->b_pages[i]); 3768 #include 3704 bp->b_pages[i] = NULL; 3769 3705 } 3770 /* DDB command to show buffer data */ 3706 vm_page_unlock_queues(); 3771 DB_SHOW_COMMAND(buffer, db_show_buffer) 3707 return(-1); 3772 { 3708 } 3773 /* get args */ 3709 m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot); 3774 struct buf *bp = (struct buf *)addr; 3710 if (m == NULL) 3775 3711 goto retry; 3776 if (!have_addr) { 3712 bp->b_pages[pidx] = m; 3777 db_printf("usage: show buffer \n"); 3713 } 3778 return; 3714 if (pidx > btoc(MAXPHYS)) 3779 } 3715 panic("vmapbuf: mapped more than MAXPHYS"); 3780 3716 pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); 3781 db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); 3717 3782 db_printf( 3718 kva = bp->b_saveaddr; 3783 "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" 3719 bp->b_npages = pidx; 3784 "b_dev = (%d,%d), b_data = %p, b_blkno = %jd\n", 3720 bp->b_saveaddr = bp->b_data; 3785 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 3721 bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); 3786 major(bp->b_dev), minor(bp->b_dev), bp->b_data, 3722 return(0); 3787 (intmax_t)bp->b_blkno); 3723 } 3788 if (bp->b_npages) { 3724 3789 int i; 3725 /* 3790 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages 3726 * Free the io map PTEs associated with this IO operation. ); 3727 * We also invalidate the TLB entries and restore the original b_addr. 3791 for (i = 0; i < bp->b_npages; i++) { 3728 */ 3792 vm_page_t m; 3729 void 3793 m = bp->b_pages[i]; 3730 vunmapbuf(struct buf *bp) 3794 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, 3731 { 3795 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); 3732 int pidx; 3796 if ((i + 1) < bp->b_npages) 3733 int npages; 3797 db_printf(","); 3734 3798 } 3735 GIANT_REQUIRED; 3799 db_printf("\n"); 3736 3800 } 3737 npages = bp->b_npages; 3801 } 3738 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), 3802 #endif /* DDB */ 3739 npages); 3740 vm_page_lock_queues(); 3741 for (pidx = 0; pidx < npages; pidx++) 3742 vm_page_unhold(bp->b_pages[pidx]); 3743 vm_page_unlock_queues(); 3744 3745 bp->b_data = bp->b_saveaddr; 3746 } 3747 3748 void 3749 bdone(struct buf *bp) 3750 { 3751 mtx_lock(&bdonelock); 3752 bp->b_flags |= B_DONE; 3753 wakeup(bp); 3754 mtx_unlock(&bdonelock); 3755 } 3756 3757 void 3758 bwait(struct buf *bp, u_char pri, const char *wchan) 3759 { 11/12/03 00:01:39 sys/kern/vfs_cluster.c 1 1 /*- 622 if (vp->v_type == VREG) { 2 * Copyright (c) 1993 623 async = vp->v_mount->mnt_flag & MNT_ASYNC; 3 * The Regents of the University of California. All rights reserved. 624 lblocksize = vp->v_mount->mnt_stat.f_iosize; 4 * Modifications/enhancements: 625 } else { 5 * Copyright (c) 1995 John S. Dyson. All rights reserved. 626 async = 0; 6 * 627 lblocksize = bp->b_bufsize; 7 * Redistribution and use in source and binary forms, with or without 628 } 8 * modification, are permitted provided that the following conditions 629 lbn = bp->b_lblkno; 9 * are met: 630 KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")) 10 * 1. Redistributions of source code must retain the above copyright ; 11 * notice, this list of conditions and the following disclaimer. 631 12 * 2. Redistributions in binary form must reproduce the above copyright 632 /* Initialize vnode to beginning of file. 
*/ 13 * notice, this list of conditions and the following disclaimer in the 633 if (lbn == 0) 14 * documentation and/or other materials provided with the distribution. 634 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 15 * 3. All advertising materials mentioning features or use of this software 635 16 * must display the following acknowledgement: 636 if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || 17 * This product includes software developed by the University of 637 (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { 18 * California, Berkeley and its contributors. 638 maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; 19 * 4. Neither the name of the University nor the names of its contributors 639 if (vp->v_clen != 0) { 20 * may be used to endorse or promote products derived from this software 640 /* 21 * without specific prior written permission. 641 * Next block is not sequential. 22 * 642 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 643 * If we are not writing at end of file, the process 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 644 * seeked to another point in the file since its last 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 645 * write, or we have reached our maximum cluster size, 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 646 * then push the previous cluster. Otherwise try 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 647 * reallocating to make it sequential. 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 648 * 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 649 * Change to algorithm: only push previous cluster if 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 650 * it was sequential from the point of view of the 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 651 * seqcount heuristic, otherwise leave the buffer 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 652 * intact so we can potentially optimize the I/O 33 * SUCH DAMAGE. 653 * later on in the buf_daemon or update daemon 34 * 654 * flush. 35 * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 655 */ 36 */ 656 cursize = vp->v_lastw - vp->v_cstart + 1; 37 657 if (((u_quad_t) bp->b_offset + lblocksize) != filesize 38 #include || 39 __FBSDID("$FreeBSD: src/sys/kern/vfs_cluster.c,v 1.148 2003/11/12 08:01:39 mck 658 lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { usick Exp $"); 659 if (!async && seqcount > 0) { 660 cluster_wbuild_wb(vp, lblocksize, 661 vp->v_cstart, cursize); 599 /* 662 } 600 * Do clustered write for FFS. 663 } else { 601 * 664 struct buf **bpp, **endbp; 602 * Three cases: 665 struct cluster_save *buflist; 603 * 1. Write is not sequential (write asynchronously) 666 604 * Write is sequential: 667 buflist = cluster_collectbufs(vp, bp); 605 * 2. beginning of cluster - begin cluster 668 endbp = &buflist->bs_children 606 * 3. middle of a cluster - add to cluster 669 [buflist->bs_nchildren - 1]; 607 * 4. 
end of a cluster - asynchronously write cluster 670 if (VOP_REALLOCBLKS(vp, buflist)) { 608 */ 671 /* 609 void 672 * Failed, push the previous cluster 610 cluster_write(bp, filesize, seqcount) 673 * if *really* writing sequentially 611 struct buf *bp; 674 * in the logical file (seqcount > 1), 612 u_quad_t filesize; 675 * otherwise delay it in the hopes tha 613 int seqcount; t 614 { 676 * the low level disk driver can 615 struct vnode *vp; 677 * optimize the write ordering. 616 daddr_t lbn; 678 */ 617 int maxclen, cursize; 679 for (bpp = buflist->bs_children; 618 int lblocksize; 680 bpp < endbp; bpp++) 619 int async; 681 brelse(*bpp); 620 682 free(buflist, M_SEGMENT); 621 vp = bp->b_vp; 683 if (seqcount > 1) { 11/12/03 00:01:39 sys/kern/vfs_cluster.c 2 684 cluster_wbuild_wb(vp, 748 } 685 lblocksize, vp->v_cstart, 749 vp->v_lastw = lbn; 686 cursize); 750 vp->v_lasta = bp->b_blkno; 687 } 751 } 688 } else { 752 689 /* 753 690 * Succeeded, keep building cluster. 754 /* 691 */ 755 * This is an awful lot like cluster_rbuild...wish they could be combined. 692 for (bpp = buflist->bs_children; 756 * The last lbn argument is the current block on which I/O is being 693 bpp <= endbp; bpp++) 757 * performed. Check to see that it doesn’t fall in the middle of 694 bdwrite(*bpp); 758 * the current block (if last_bp == NULL). 695 free(buflist, M_SEGMENT); 759 */ 696 vp->v_lastw = lbn; 760 int 697 vp->v_lasta = bp->b_blkno; 761 cluster_wbuild(vp, size, start_lbn, len) 698 return; 762 struct vnode *vp; 699 } 763 long size; 700 } 764 daddr_t start_lbn; 701 } 765 int len; 702 /* 766 { 703 * Consider beginning a cluster. If at end of file, make 767 struct buf *bp, *tbp; 704 * cluster as large as possible, otherwise find size of 768 int i, j, s; 705 * existing cluster. 769 int totalwritten = 0; 706 */ 770 int dbsize = btodb(size); 707 if ((vp->v_type == VREG) && 771 708 ((u_quad_t) bp->b_offset + lblocksize) != filesize && 772 GIANT_REQUIRED; 709 (bp->b_blkno == bp->b_lblkno) && 773 710 (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 774 while (len > 0) { 711 bp->b_blkno == -1)) { 775 s = splbio(); 712 bawrite(bp); 776 /* 713 vp->v_clen = 0; 777 * If the buffer is not delayed-write (i.e. dirty), or it 714 vp->v_lasta = bp->b_blkno; 778 * is delayed-write but either locked or inval, it cannot 715 vp->v_cstart = lbn + 1; 779 * partake in the clustered write. 716 vp->v_lastw = lbn; 780 */ 717 return; 781 VI_LOCK(vp); 718 } 782 if ((tbp = gbincore(vp, start_lbn)) == NULL || 719 vp->v_clen = maxclen; 783 (tbp->b_vflags & BV_BKGRDINPROG)) { 720 if (!async && maxclen == 0) { /* I/O not contiguous */ 784 VI_UNLOCK(vp); 721 vp->v_cstart = lbn + 1; 785 ++start_lbn; 722 bawrite(bp); 786 --len; 723 } else { /* Wait for rest of cluster */ 787 splx(s); 724 vp->v_cstart = lbn; 788 continue; 725 bdwrite(bp); 789 } 726 } 790 if (BUF_LOCK(tbp, 727 } else if (lbn == vp->v_cstart + vp->v_clen) { 791 LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, VI_MTX(vp))) { 728 /* 792 ++start_lbn; 729 * At end of cluster, write it out if seqcount tells us we 793 --len; 730 * are operating sequentially, otherwise let the buf or 794 splx(s); 731 * update daemon handle it. 
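/*
 * A hedged sketch (not from the listing), loosely modeled on how a
 * filesystem write path hands full blocks to cluster_write() above and
 * falls back to bawrite()/bdwrite() otherwise.  dispatch_write() and the
 * fullblock flag are illustrative; a real filesystem such as FFS has
 * additional cases (partial blocks, MNT_NOCLUSTERW, and so on).
 */
static void
dispatch_write(struct buf *bp, u_quad_t filesize, int ioflag, int seqcount,
    int fullblock)
{
	if (ioflag & IO_SYNC)
		(void)BUF_WRITE(bp);	/* caller wants it on stable storage now */
	else if (fullblock)
		cluster_write(bp, filesize, seqcount);
	else if (ioflag & IO_ASYNC)
		bawrite(bp);		/* start it, do not wait */
	else
		bdwrite(bp);		/* delay; update daemon or buf_daemon flushes it */
}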
795 continue; 732 */ 796 } 733 bdwrite(bp); 797 if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { 734 if (seqcount > 1) 798 BUF_UNLOCK(tbp); 735 cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_ 799 ++start_lbn; clen + 1); 800 --len; 736 vp->v_clen = 0; 801 splx(s); 737 vp->v_cstart = lbn + 1; 802 continue; 738 } else if (vm_page_count_severe()) { 803 } 739 /* 804 bremfree(tbp); 740 * We are low on memory, get it going NOW 805 tbp->b_flags &= ˜B_DONE; 741 */ 806 splx(s); 742 bawrite(bp); 807 743 } else { 808 /* 744 /* 809 * Extra memory in the buffer, punt on this buffer. 745 * In the middle of a cluster, so just delay the I/O for now. 810 * XXX we could handle this in most cases, but we would 746 */ 811 * have to push the extra memory down to after our max 747 bdwrite(bp); 812 * possible cluster size and then potentially pull it back 11/12/03 00:01:39 sys/kern/vfs_cluster.c 3 813 * up if the cluster was terminated prematurely--too much 878 /* 814 * hassle. 879 * If it IS in core, but has different 815 */ 880 * characteristics, or is locked (which 816 if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != 881 * means it could be undergoing a background 817 (B_CLUSTEROK | B_VMIO)) || 882 * I/O or be in a weird state), then don’t 818 (tbp->b_bcount != tbp->b_bufsize) || 883 * cluster with it. 819 (tbp->b_bcount != size) || 884 */ 820 (len == 1) || 885 if (BUF_LOCK(tbp, 821 ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { 886 LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, 822 totalwritten += tbp->b_bufsize; 887 VI_MTX(vp))) { 823 bawrite(tbp); 888 splx(s); 824 ++start_lbn; 889 break; 825 --len; 890 } 826 continue; 891 827 } 892 if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | 828 893 B_INVAL | B_DELWRI | B_NEEDCOMMIT)) 829 /* 894 != (B_DELWRI | B_CLUSTEROK | 830 * We got a pbuf to make the cluster in. 895 (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) | 831 * so initialise it. | 832 */ 896 tbp->b_wcred != bp->b_wcred) { 833 TAILQ_INIT(&bp->b_cluster.cluster_head); 897 BUF_UNLOCK(tbp); 834 bp->b_bcount = 0; 898 splx(s); 835 bp->b_magic = tbp->b_magic; 899 break; 836 bp->b_op = tbp->b_op; 900 } 837 bp->b_bufsize = 0; 901 838 bp->b_npages = 0; 902 /* 839 if (tbp->b_wcred != NOCRED) 903 * Check that the combined cluster 840 bp->b_wcred = crhold(tbp->b_wcred); 904 * would make sense with regard to pages 841 905 * and would not be too large 842 bp->b_blkno = tbp->b_blkno; 906 */ 843 bp->b_lblkno = tbp->b_lblkno; 907 if ((tbp->b_bcount != size) || 844 bp->b_offset = tbp->b_offset; 908 ((bp->b_blkno + (dbsize * i)) != 845 909 tbp->b_blkno) || 846 /* 910 ((tbp->b_npages + bp->b_npages) > 847 * We are synthesizing a buffer out of vm_page_t’s, but 911 (vp->v_mount->mnt_iosize_max / PAGE_SIZE)) 848 * if the block size is not page aligned then the starting ) { 849 * address may not be either. Inherit the b_data offset 912 BUF_UNLOCK(tbp); 850 * from the original buffer. 913 splx(s); 851 */ 914 break; 852 bp->b_data = (char *)((vm_offset_t)bp->b_data | 915 } 853 ((vm_offset_t)tbp->b_data & PAGE_MASK)); 916 /* 854 bp->b_flags |= B_CLUSTER | 917 * Ok, it’s passed all the tests, 855 (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); 918 * so remove it from the free list 856 bp->b_iodone = cluster_callback; 919 * and mark it busy. We will use it. 857 pbgetvp(vp, bp); 920 */ 858 /* 921 bremfree(tbp); 859 * From this location in the file, scan forward to see 922 tbp->b_flags &= ˜B_DONE; 860 * if there are buffers with adjacent data that need to 923 splx(s); 861 * be written as well. 
924 } /* end of code for non-first buffers only */ 862 */ 925 /* check for latent dependencies to be handled */ 863 for (i = 0; i < len; ++i, ++start_lbn) { 926 if ((LIST_FIRST(&tbp->b_dep)) != NULL) { 864 if (i != 0) { /* If not the first buffer */ 927 tbp->b_iocmd = BIO_WRITE; 865 s = splbio(); 928 buf_start(tbp); 866 /* 929 } 867 * If the adjacent data is not even in core it 930 /* 868 * can’t need to be written. 931 * If the IO is via the VM then we do some 869 */ 932 * special VM hackery (yuck). Since the buffer’s 870 VI_LOCK(vp); 933 * block size may not be page-aligned it is possible 871 if ((tbp = gbincore(vp, start_lbn)) == NULL || 934 * for a page to be shared between two buffers. We 872 (tbp->b_vflags & BV_BKGRDINPROG)) { 935 * have to get rid of the duplication when building 873 VI_UNLOCK(vp); 936 * the cluster. 874 splx(s); 937 */ 875 break; 938 if (tbp->b_flags & B_VMIO) { 876 } 939 vm_page_t m; 877 940 11/12/03 00:01:39 sys/kern/vfs_cluster.c 4 941 if (i != 0) { /* if not first buffer */ 942 for (j = 0; j < tbp->b_npages; j += 1) { 943 m = tbp->b_pages[j]; 944 if (m->flags & PG_BUSY) { 945 bqrelse(tbp); 946 goto finishcluster; 947 } 948 } 949 } 950 if (tbp->b_object != NULL) 951 VM_OBJECT_LOCK(tbp->b_object); 952 vm_page_lock_queues(); 953 for (j = 0; j < tbp->b_npages; j += 1) { 954 m = tbp->b_pages[j]; 955 vm_page_io_start(m); 956 vm_object_pip_add(m->object, 1); 957 if ((bp->b_npages == 0) || 958 (bp->b_pages[bp->b_npages - 1] != m) ) { 959 bp->b_pages[bp->b_npages] = m; 960 bp->b_npages++; 961 } 962 } 963 vm_page_unlock_queues(); 964 if (tbp->b_object != NULL) 965 VM_OBJECT_UNLOCK(tbp->b_object); 966 } 967 bp->b_bcount += size; 968 bp->b_bufsize += size; 969 970 s = splbio(); 971 bundirty(tbp); 972 tbp->b_flags &= ˜B_DONE; 973 tbp->b_ioflags &= ˜BIO_ERROR; 974 tbp->b_flags |= B_ASYNC; 975 tbp->b_iocmd = BIO_WRITE; 976 reassignbuf(tbp, tbp->b_vp); /* put on clean list * / 977 VI_LOCK(tbp->b_vp); 978 ++tbp->b_vp->v_numoutput; 979 VI_UNLOCK(tbp->b_vp); 980 splx(s); 981 BUF_KERNPROC(tbp); 982 TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 983 tbp, b_cluster.cluster_entry); 984 } 985 finishcluster: 986 pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 987 (vm_page_t *) bp->b_pages, bp->b_npages); 988 if (bp->b_bufsize > bp->b_kvasize) 989 panic( 990 "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n" , 991 bp->b_bufsize, bp->b_kvasize); 992 bp->b_kvasize = bp->b_bufsize; 993 totalwritten += bp->b_bufsize; 994 bp->b_dirtyoff = 0; 995 bp->b_dirtyend = bp->b_bufsize; 996 bawrite(bp); 997 998 len -= i; 999 } 1000 return totalwritten; 1001 } 11/04/03 20:30:07 sys/kern/vfs_default.c 1 1 /* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed 6 * to Berkeley by John Heidemann of the UCLA Ficus project. 7 * 8 * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. 
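cluster_wbuild() above gathers logically adjacent delayed-write buffers into one large transfer before handing it to bawrite(). The following is a stripped-down userland model of that gather loop, using a plain array of "dirty" flags in place of real buffers; all names are hypothetical and this only sketches the coalescing idea, not the kernel code.

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 16

/* 1 = delayed-write (dirty) block that may join a cluster. */
static bool dirty[NBLOCKS] = {
	0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1
};

/* Pretend to issue one I/O covering blocks [start, start + len). */
static void
issue_write(int start, int len)
{
	printf("write blocks %d..%d in one I/O\n", start, start + len - 1);
}

/*
 * Walk the range once; every run of adjacent dirty blocks becomes a single
 * clustered write, mirroring the while (len > 0) loop in cluster_wbuild().
 */
static int
wbuild_model(int start, int len)
{
	int total = 0;

	while (len > 0) {
		if (!dirty[start]) {		/* cannot partake in the cluster */
			start++;
			len--;
			continue;
		}
		int i;
		for (i = 0; i < len && dirty[start + i]; i++)
			dirty[start + i] = false;	/* "bundirty" the member */
		issue_write(start, i);
		total += i;
		start += i;
		len -= i;
	}
	return (total);
}

int
main(void)
{
	printf("total blocks written: %d\n", wbuild_model(0, NBLOCKS));
	return (0);
}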
All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 #include 40 __FBSDID("$FreeBSD: src/sys/kern/vfs_default.c,v 1.91 2003/11/05 04:30:07 kan Exp $");

648 /* 649 * Return the underlying VM object. This routine may be called with or 650 * without the vnode interlock held. If called without, the returned 651 * object is not guarenteed to be valid. The syncer typically gets the 652 * object without holding the interlock in order to quickly test whether 653 * it might be dirty before going heavy-weight. vm_object’s use zalloc 654 * and thus stable-storage, so this is safe. 655 */ 656 int 657 vop_stdgetvobject(ap) 658 struct vop_getvobject_args /* { 659 struct vnode *vp; 660 struct vm_object **objpp; 661 } */ *ap; 662 { 663 struct vnode *vp = ap->a_vp; 664 struct vm_object **objpp = ap->a_objpp; 665 666 if (objpp) 667 *objpp = vp->v_object; 668 return (vp->v_object ? 0 : EINVAL); 669 } 10/04/03 07:35:22 sys/kern/vfs_vnops.c 1 1 /* 566 bwillwrite(); 2 * Copyright (c) 1982, 1986, 1989, 1993 567 ioflag = IO_UNIT; 3 * The Regents of the University of California. All rights reserved. 568 if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) 4 * (c) UNIX System Laboratories, Inc. 569 ioflag |= IO_APPEND; 5 * All or some portions of this file are derived from material licensed 570 if (fp->f_flag & FNONBLOCK) 6 * to the University of California by American Telephone and Telegraph 571 ioflag |= IO_NDELAY; 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 572 if (fp->f_flag & O_DIRECT) 8 * the permission of UNIX System Laboratories, Inc. 573 ioflag |= IO_DIRECT; 9 * 574 if ((fp->f_flag & O_FSYNC) || 10 * Redistribution and use in source and binary forms, with or without 575 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) 11 * modification, are permitted provided that the following conditions 576 ioflag |= IO_SYNC; 12 * are met: 577 mp = NULL; 13 * 1. Redistributions of source code must retain the above copyright 578 if (vp->v_type != VCHR && 14 * notice, this list of conditions and the following disclaimer. 579 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { 15 * 2. Redistributions in binary form must reproduce the above copyright 580 mtx_unlock(&Giant); 16 * notice, this list of conditions and the following disclaimer in the 581 return (error); 17 * documentation and/or other materials provided with the distribution. 582 } 18 * 3. All advertising materials mentioning features or use of this software 583 VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE); 19 * must display the following acknowledgement: 584 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 20 * This product includes software developed by the University of 585 if ((flags & FOF_OFFSET) == 0) 21 * California, Berkeley and its contributors. 586 uio->uio_offset = fp->f_offset; 22 * 4. Neither the name of the University nor the names of its contributors 587 ioflag |= sequential_heuristic(uio, fp); 23 * may be used to endorse or promote products derived from this software 588 #ifdef MAC 24 * without specific prior written permission. 589 error = mac_check_vnode_write(active_cred, fp->f_cred, vp); 25 * 590 if (error == 0) 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 591 #endif 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 592 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 593 if ((flags & FOF_OFFSET) == 0) 29 * ARE DISCLAIMED. 
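vop_stdgetvobject() above is a "default" vnode operation: it receives a single argument structure rather than a plain parameter list, which is how the VOP dispatch layer packs its arguments. A toy model of that calling convention follows; the struct and names are illustrative stand-ins, not the kernel's vop_getvobject_args.

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct toy_vnode {
	void *v_object;			/* backing object, may be NULL */
};

/* Argument bundle, in the style of struct vop_getvobject_args. */
struct toy_getvobject_args {
	struct toy_vnode *a_vp;
	void **a_objpp;
};

/* Default implementation: hand back whatever the vnode already caches. */
static int
toy_stdgetvobject(struct toy_getvobject_args *ap)
{
	if (ap->a_objpp != NULL)
		*ap->a_objpp = ap->a_vp->v_object;
	return (ap->a_vp->v_object != NULL ? 0 : EINVAL);
}

int
main(void)
{
	struct toy_vnode vn = { .v_object = &vn };	/* any non-NULL object */
	void *obj;
	struct toy_getvobject_args ap = { .a_vp = &vn, .a_objpp = &obj };

	printf("result %d, obj %p\n", toy_stdgetvobject(&ap), obj);
	return (0);
}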
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 594 fp->f_offset = uio->uio_offset; 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 595 fp->f_nextoff = uio->uio_offset; 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 596 VOP_UNLOCK(vp, 0, td); 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 597 vn_finished_write(mp); 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 598 mtx_unlock(&Giant); 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 599 return (error); 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 600 } 36 * SUCH DAMAGE. 37 * 38 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 39 */ 40 41 #include 42 __FBSDID("$FreeBSD: src/sys/kern/vfs_vnops.c,v 1.195 2003/10/04 14:35:22 jeff Exp $");
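Before calling VOP_WRITE(), vn_write() on this page translates the open-file flags into per-request I/O flags (IO_APPEND, IO_NDELAY, IO_DIRECT, IO_SYNC on top of IO_UNIT). Here is a sketch of that mapping; the numeric flag values are invented for the example and only the translation logic follows the listing.

#include <stdio.h>

/* Stand-in flag values for the sketch; not the kernel's definitions. */
#define F_APPEND	0x01
#define F_NONBLOCK	0x02
#define F_DIRECT	0x04
#define F_FSYNC		0x08

#define IO_UNIT		0x0100
#define IO_APPEND	0x0200
#define IO_NDELAY	0x0400
#define IO_DIRECT	0x0800
#define IO_SYNC		0x1000

/*
 * Translate open-file flags into per-request I/O flags, following the
 * mapping vn_write() performs before calling VOP_WRITE().
 */
static int
build_ioflag(int fflag, int is_regular, int mount_synchronous)
{
	int ioflag = IO_UNIT;		/* the write is one atomic unit */

	if (is_regular && (fflag & F_APPEND))
		ioflag |= IO_APPEND;
	if (fflag & F_NONBLOCK)
		ioflag |= IO_NDELAY;
	if (fflag & F_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fflag & F_FSYNC) || mount_synchronous)
		ioflag |= IO_SYNC;
	return (ioflag);
}

int
main(void)
{
	printf("ioflag = %#x\n", build_ioflag(F_APPEND | F_FSYNC, 1, 0));
	return (0);
}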

546 /* 547 * File table vnode write routine. 548 */ 549 static int 550 vn_write(fp, uio, active_cred, flags, td) 551 struct file *fp; 552 struct uio *uio; 553 struct ucred *active_cred; 554 struct thread *td; 555 int flags; 556 { 557 struct vnode *vp; 558 struct mount *mp; 559 int error, ioflag; 560 561 mtx_lock(&Giant); 562 KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", 563 uio->uio_td, td)); 564 vp = fp->f_vnode; 565 if (vp->v_type == VREG) 12/03/03 13:12:09 sys/i386/i386/machdep.c 1 1 /*- 1102 regs->tf_eip = entry; 2 * Copyright (c) 1992 Terrence R. Lambert. 1103 regs->tf_esp = stack; 3 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 1104 regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); 4 * All rights reserved. 1105 regs->tf_ss = _udatasel; 5 * 1106 regs->tf_ds = _udatasel; 6 * This code is derived from software contributed to Berkeley by 1107 regs->tf_es = _udatasel; 7 * William Jolitz. 1108 regs->tf_fs = _udatasel; 8 * 1109 regs->tf_cs = _ucodesel; 9 * Redistribution and use in source and binary forms, with or without 1110 10 * modification, are permitted provided that the following conditions 1111 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 11 * are met: 1112 regs->tf_ebx = ps_strings; 12 * 1. Redistributions of source code must retain the above copyright 1113 13 * notice, this list of conditions and the following disclaimer. 1114 /* 14 * 2. Redistributions in binary form must reproduce the above copyright 1115 * Reset the hardware debug registers if they were in use. 15 * notice, this list of conditions and the following disclaimer in the 1116 * They won’t have any meaning for the newly exec’d process. 16 * documentation and/or other materials provided with the distribution. 1117 */ 17 * 3. All advertising materials mentioning features or use of this software 1118 if (pcb->pcb_flags & PCB_DBREGS) { 18 * must display the following acknowledgement: 1119 pcb->pcb_dr0 = 0; 19 * This product includes software developed by the University of 1120 pcb->pcb_dr1 = 0; 20 * California, Berkeley and its contributors. 1121 pcb->pcb_dr2 = 0; 21 * 4. Neither the name of the University nor the names of its contributors 1122 pcb->pcb_dr3 = 0; 22 * may be used to endorse or promote products derived from this software 1123 pcb->pcb_dr6 = 0; 23 * without specific prior written permission. 1124 pcb->pcb_dr7 = 0; 24 * 1125 if (pcb == PCPU_GET(curpcb)) { 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 1126 /* 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1127 * Clear the debug registers on the running 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1128 * CPU, otherwise they will end up affecting 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 1129 * the next process we switch to. 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1130 */ 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 1131 reset_dbregs(); 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 1132 } 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 1133 pcb->pcb_flags &= ˜PCB_DBREGS; 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 1134 } 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 1135 35 * SUCH DAMAGE. 1136 /* 36 * 1137 * Initialize the math emulator (if any) for the current process. 
37 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 1138 * Actually, just clear the bit that says that the emulator has 38 */ 1139 * been initialized. Initialization is delayed until the process 39 1140 * traps to the emulator (if it is done at all) mainly because 40 #include 1141 * emulators don’t provide an entry point for initialization. 41 __FBSDID("$FreeBSD: src/sys/i386/i386/machdep.c,v 1.584 2003/12/03 21:12:09 jh 1142 */ b Exp $"); 1143 td->td_pcb->pcb_flags &= ˜FP_SOFTFP; 1144 1145 /* 1081 /* 1146 * Arrange to trap the next npx or ‘fwait’ instruction (see npx.c 1082 * Clear registers on exec 1147 * for why fwait must be trapped at least if there is an npx or an 1083 */ 1148 * emulator). This is mainly to handle the case where npx0 is not 1084 void 1149 * configured, since the npx routines normally set up the trap 1085 exec_setregs(td, entry, stack, ps_strings) 1150 * otherwise. It should be done only at boot time, but doing it 1086 struct thread *td; 1151 * here allows modifying ‘npx_exists’ for testing the emulator on 1087 u_long entry; 1152 * systems with an npx. 1088 u_long stack; 1153 */ 1089 u_long ps_strings; 1154 load_cr0(rcr0() | CR0_MP | CR0_TS); 1090 { 1155 1091 struct trapframe *regs = td->td_frame; 1156 /* Initialize the npx (if any) for the current process. */ 1092 struct pcb *pcb = td->td_pcb; 1157 /* 1093 1158 * XXX the above load_cr0() also initializes it and is a layering 1094 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 1159 * violation if NPX is configured. It drops the npx partially 1095 pcb->pcb_gs = _udatasel; 1160 * and this would be fatal if we were interrupted now, and decided 1096 load_gs(_udatasel); 1161 * to force the state to the pcb, and checked the invariant 1097 1162 * (CR0_TS clear) if and only if PCPU_GET(fpcurthread) != NULL). 1098 if (td->td_proc->p_md.md_ldt) 1163 * ALL of this can happen except the check. The check used to 1099 user_ldt_free(td); 1164 * happen and be fatal later when we didn’t complete the drop 1100 1165 * before returning to user mode. This should be fixed properly 1101 bzero((char *)regs, sizeof(struct trapframe)); 1166 * soon. 12/03/03 13:12:09 sys/i386/i386/machdep.c 2 1167 */ 1168 fpstate_drop(td); 1169 1170 /* 1171 * XXX - Linux emulator 1172 * Make sure sure edx is 0x0 on entry. Linux binaries depend 1173 * on it. 1174 */ 1175 td->td_retval[1] = 0; 1176 } 11/07/03 19:01:26 sys/i386/i386/pmap.c 1 1 /*- 66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2 * Copyright (c) 1991 Regents of the University of California. 67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 3 * All rights reserved. 68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 4 * Copyright (c) 1994 John S. Dyson 69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 5 * All rights reserved. 70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 6 * Copyright (c) 1994 David Greenman 71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 7 * All rights reserved. 72 * SUCH DAMAGE. 8 * 73 */ 9 * This code is derived from software contributed to Berkeley by 74 10 * the Systems Programming Group of the University of Utah Computer 75 #include 11 * Science Department and William Jolitz of UUNET Technologies Inc. 
76 __FBSDID("$FreeBSD: src/sys/i386/i386/pmap.c,v 1.455 2003/11/08 03:01:26 alc E 12 * xp $"); 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 2581 16 * 1. Redistributions of source code must retain the above copyright 2582 #define PMAP_REMOVE_PAGES_CURPROC_ONLY 17 * notice, this list of conditions and the following disclaimer. 2583 /* 18 * 2. Redistributions in binary form must reproduce the above copyright 2584 * Remove all pages from specified address space 19 * notice, this list of conditions and the following disclaimer in the 2585 * this aids process exit speeds. Also, this code 20 * documentation and/or other materials provided with the distribution. 2586 * is special cased for current process only, but 21 * 3. All advertising materials mentioning features or use of this software 2587 * can have the more generic (and slightly slower) 22 * must display the following acknowledgement: 2588 * mode enabled. This is much faster than pmap_remove 23 * This product includes software developed by the University of 2589 * in the case of running down an entire address space. 24 * California, Berkeley and its contributors. 2590 */ 25 * 4. Neither the name of the University nor the names of its contributors 2591 void 26 * may be used to endorse or promote products derived from this software 2592 pmap_remove_pages(pmap, sva, eva) 27 * without specific prior written permission. 2593 pmap_t pmap; 28 * 2594 vm_offset_t sva, eva; 29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 2595 { 30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2596 pt_entry_t *pte, tpte; 31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2597 vm_page_t m; 32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2598 pv_entry_t pv, npv; 33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2599 int s; 34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2600 35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2601 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY 36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2602 if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace) 37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY )) { 38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2603 printf("warning: pmap_remove_pages called with non-current pma 39 * SUCH DAMAGE. p\n"); 40 * 2604 return; 41 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 2605 } 42 */ 2606 #endif 43 /*- 2607 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 44 * Copyright (c) 2003 Networks Associates Technology, Inc. 2608 s = splvm(); 45 * All rights reserved. 2609 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 46 * 2610 47 * This software was developed for the FreeBSD Project by Jake Burkholder, 2611 if (pv->pv_va >= eva || pv->pv_va < sva) { 48 * Safeport Network Services, and Network Associates Laboratories, the 2612 npv = TAILQ_NEXT(pv, pv_plist); 49 * Security Research Division of Network Associates, Inc. under 2613 continue; 50 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 2614 } 51 * CHATS research program. 
2615 52 * 2616 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY 53 * Redistribution and use in source and binary forms, with or without 2617 pte = vtopte(pv->pv_va); 54 * modification, are permitted provided that the following conditions 2618 #else 55 * are met: 2619 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 56 * 1. Redistributions of source code must retain the above copyright 2620 #endif 57 * notice, this list of conditions and the following disclaimer. 2621 tpte = *pte; 58 * 2. Redistributions in binary form must reproduce the above copyright 2622 59 * notice, this list of conditions and the following disclaimer in the 2623 if (tpte == 0) { 60 * documentation and/or other materials provided with the distribution. 2624 printf("TPTE at %p IS ZERO @ VA %08x\n", 61 * 2625 pte, pv->pv_va); 62 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ‘‘AS IS’’ AND 2626 panic("bad pte"); 63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2627 } 64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2628 65 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 2629 /* 11/07/03 19:01:26 sys/i386/i386/pmap.c 2 2630 * We cannot remove wired pages from a process’ mapping at this time 2631 */ 2632 if (tpte & PG_W) { 2633 npv = TAILQ_NEXT(pv, pv_plist); 2634 continue; 2635 } 2636 2637 m = PHYS_TO_VM_PAGE(tpte); 2638 KASSERT(m->phys_addr == (tpte & PG_FRAME), 2639 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 2640 m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); 2641 2642 KASSERT(m < &vm_page_array[vm_page_array_size], 2643 ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)) ; 2644 2645 pv->pv_pmap->pm_stats.resident_count--; 2646 2647 pte_clear(pte); 2648 2649 /* 2650 * Update the vm_page_t clean and reference bits. 2651 */ 2652 if (tpte & PG_M) { 2653 vm_page_dirty(m); 2654 } 2655 2656 npv = TAILQ_NEXT(pv, pv_plist); 2657 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 2658 2659 m->md.pv_list_count--; 2660 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2661 if (TAILQ_FIRST(&m->md.pv_list) == NULL) { 2662 vm_page_flag_clear(m, PG_WRITEABLE); 2663 } 2664 2665 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); 2666 free_pv_entry(pv); 2667 } 2668 splx(s); 2669 pmap_invalidate_all(pmap); 2670 } 11/17/03 10:22:24 sys/i386/i386/vm_machdep.c 1 1 /*- 141 2 * Copyright (c) 1982, 1986 The Regents of the University of California. 142 p1 = td1->td_proc; 3 * Copyright (c) 1989, 1990 William Jolitz 143 if ((flags & RFPROC) == 0) { 4 * Copyright (c) 1994 John Dyson 144 if ((flags & RFMEM) == 0) { 5 * All rights reserved. 145 /* unshare user LDT */ 6 * 146 struct mdproc *mdp1 = &p1->p_md; 7 * This code is derived from software contributed to Berkeley by 147 struct proc_ldt *pldt = mdp1->md_ldt; 8 * the Systems Programming Group of the University of Utah Computer 148 if (pldt && pldt->ldt_refcnt > 1) { 9 * Science Department, and William Jolitz. 149 pldt = user_ldt_alloc(mdp1, pldt->ldt_len); 10 * 150 if (pldt == NULL) 11 * Redistribution and use in source and binary forms, with or without 151 panic("could not copy LDT"); 12 * modification, are permitted provided that the following conditions 152 mdp1->md_ldt = pldt; 13 * are met: 153 set_user_ldt(mdp1); 14 * 1. Redistributions of source code must retain the above copyright 154 user_ldt_free(td1); 15 * notice, this list of conditions and the following disclaimer. 155 } 16 * 2. 
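pmap_remove_pages() above walks pm_pvlist with a pv/npv pair, fetching the successor before the current entry may be freed, and it skips wired mappings. A minimal userland model of that "capture the next pointer first" deletion pattern, using a hand-rolled singly linked list with hypothetical names:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct pv_model {
	unsigned long va;		/* virtual address this entry maps */
	bool wired;			/* wired mappings are skipped */
	struct pv_model *next;
};

/*
 * Remove every non-wired entry whose va falls in [sva, eva), grabbing the
 * successor before the current node may be freed, as pmap_remove_pages()
 * does with pv/npv on pm_pvlist.
 */
static void
remove_range(struct pv_model **head, unsigned long sva, unsigned long eva)
{
	struct pv_model *pv, *npv, **prevp = head;

	for (pv = *head; pv != NULL; pv = npv) {
		npv = pv->next;			/* capture before any free() */
		if (pv->va < sva || pv->va >= eva || pv->wired) {
			prevp = &pv->next;	/* keep this entry */
			continue;
		}
		*prevp = npv;			/* unlink and release */
		free(pv);
	}
}

int
main(void)
{
	struct pv_model *head = NULL;

	for (unsigned long va = 0x5000; va >= 0x1000; va -= 0x1000) {
		struct pv_model *pv = malloc(sizeof(*pv));
		pv->va = va;
		pv->wired = (va == 0x3000);
		pv->next = head;
		head = pv;
	}
	remove_range(&head, 0x2000, 0x5000);
	for (struct pv_model *pv = head; pv != NULL; pv = pv->next)
		printf("kept va %#lx%s\n", pv->va, pv->wired ? " (wired)" : "");
	return (0);
}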
Redistributions in binary form must reproduce the above copyright 156 } 17 * notice, this list of conditions and the following disclaimer in the 157 return; 18 * documentation and/or other materials provided with the distribution. 158 } 19 * 3. All advertising materials mentioning features or use of this software 159 20 * must display the following acknowledgement: 160 /* Ensure that p1’s pcb is up to date. */ 21 * This product includes software developed by the University of 161 #ifdef DEV_NPX 22 * California, Berkeley and its contributors. 162 if (td1 == curthread) 23 * 4. Neither the name of the University nor the names of its contributors 163 td1->td_pcb->pcb_gs = rgs(); 24 * may be used to endorse or promote products derived from this software 164 savecrit = intr_disable(); 25 * without specific prior written permission. 165 if (PCPU_GET(fpcurthread) == td1) 26 * 166 npxsave(&td1->td_pcb->pcb_save); 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 167 intr_restore(savecrit); 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 168 #endif 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 169 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 170 /* Point the pcb to the top of the stack */ 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 171 pcb2 = (struct pcb *)(td2->td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 172 td2->td_pcb = pcb2; 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 173 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 174 /* Copy p1’s pcb */ 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 175 bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 176 37 * SUCH DAMAGE. 177 /* Point mdproc and then copy over td1’s contents */ 38 * 178 mdp2 = &p2->p_md; 39 * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 179 bcopy(&p1->p_md, mdp2, sizeof(*mdp2)); 40 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ 180 41 */ 181 /* 42 182 * Create a new fresh stack for the new process. 43 #include 183 * Copy the trap frame for the return to user mode as if from a 44 __FBSDID("$FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.219 2003/11/17 18:22:24 184 * syscall. This copies most of the user mode register values. alc Exp $"); 185 * The -16 is so we can expand the trapframe if we go to vm86. 186 */ 187 td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb - 16) - 1; 123 /* 188 bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe)); 124 * Finish a fork operation, with process p2 nearly set up. 189 125 * Copy and update the pcb, set up the stack so that the child 190 td2->td_frame->tf_eax = 0; /* Child returns zero */ 126 * ready to run and return to user mode. 191 td2->td_frame->tf_eflags &= ˜PSL_C; /* success */ 127 */ 192 td2->td_frame->tf_edx = 1; 128 void 193 129 cpu_fork(td1, p2, td2, flags) 194 /* 130 register struct thread *td1; 195 * Set registers for trampoline to user mode. Leave space for the 131 register struct proc *p2; 196 * return address on stack. These are the kernel mode register values 132 struct thread *td2; . 
133 int flags; 197 */ 134 { 198 #ifdef PAE 135 register struct proc *p1; 199 pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdpt); 136 struct pcb *pcb2; 200 #else 137 struct mdproc *mdp2; 201 pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdir); 138 #ifdef DEV_NPX 202 #endif 139 register_t savecrit; 203 pcb2->pcb_edi = 0; 140 #endif 204 pcb2->pcb_esi = (int)fork_return; /* fork_trampoline argument */ 11/17/03 10:22:24 sys/i386/i386/vm_machdep.c 2 205 pcb2->pcb_ebp = 0; 289 } 206 pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *); 207 pcb2->pcb_ebx = (int)td2; /* fork_trampoline argument */ 208 pcb2->pcb_eip = (int)fork_trampoline; 209 pcb2->pcb_psl = PSL_KERNEL; /* ints disabled */ 210 pcb2->pcb_gs = rgs(); 211 /*- 212 * pcb2->pcb_dr*: cloned above. 213 * pcb2->pcb_savefpu: cloned above. 214 * pcb2->pcb_flags: cloned above. 215 * pcb2->pcb_onfault: cloned above (always NULL here?). 216 * pcb2->pcb_gs: cloned above. 217 * pcb2->pcb_ext: cleared below. 218 */ 219 220 /* 221 * XXX don’t copy the i/o pages. this should probably be fixed. 222 */ 223 pcb2->pcb_ext = 0; 224 225 /* Copy the LDT, if necessary. */ 226 mtx_lock_spin(&sched_lock); 227 if (mdp2->md_ldt != 0) { 228 if (flags & RFMEM) { 229 mdp2->md_ldt->ldt_refcnt++; 230 } else { 231 mdp2->md_ldt = user_ldt_alloc(mdp2, 232 mdp2->md_ldt->ldt_len); 233 if (mdp2->md_ldt == NULL) 234 panic("could not copy LDT"); 235 } 236 } 237 mtx_unlock_spin(&sched_lock); 238 239 /* 240 * Now, cpu_switch() can schedule the new process. 241 * pcb_esp is loaded pointing to the cpu_switch() stack frame 242 * containing the return address when exiting cpu_switch. 243 * This will normally be to fork_trampoline(), which will have 244 * %ebx loaded with the new proc’s pointer. fork_trampoline() 245 * will set up a stack to call fork_return(p, frame); to complete 246 * the return to user-mode. 247 */ 248 }
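cpu_fork() above copies the parent's trap frame into the child and then patches it so the child observes fork() returning 0 with the carry (error) flag clear. A tiny model of that convention with a fake register frame; the struct is a stand-in, not the real struct trapframe, and PSL_C models the x86 carry bit used to flag syscall errors.

#include <stdio.h>
#include <string.h>

#define PSL_C 0x1		/* carry flag: used to signal syscall error */

/* Toy register frame standing in for struct trapframe. */
struct frame_model {
	unsigned int tf_eax;	/* syscall return value */
	unsigned int tf_edx;
	unsigned int tf_eflags;
};

/*
 * Clone the parent's frame for the child, then patch it so the child sees
 * fork() succeed and return 0, as cpu_fork() does for td2->td_frame.
 */
static void
fork_child_frame(const struct frame_model *parent, struct frame_model *child)
{
	memcpy(child, parent, sizeof(*child));
	child->tf_eax = 0;		/* child returns zero */
	child->tf_eflags &= ~PSL_C;	/* success */
	child->tf_edx = 1;
}

int
main(void)
{
	struct frame_model parent = { .tf_eax = 1234, .tf_edx = 0, .tf_eflags = PSL_C };
	struct frame_model child;

	fork_child_frame(&parent, &child);
	printf("child eax %u, eflags %#x\n", child.tf_eax, child.tf_eflags);
	return (0);
}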

270 void 271 cpu_exit(struct thread *td) 272 { 273 struct mdproc *mdp; 274 struct pcb *pcb = td->td_pcb; 275 276 277 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 278 mdp = &td->td_proc->p_md; 279 if (mdp->md_ldt) { 280 td->td_pcb->pcb_gs = _udatasel; 281 load_gs(_udatasel); 282 user_ldt_free(td); 283 } 284 if (pcb->pcb_flags & PCB_DBREGS) { 285 /* disable all hardware breakpoints */ 286 reset_dbregs(); 287 pcb->pcb_flags &= ˜PCB_DBREGS; 288 } 07/16/02 15:36:00 sys/ufs/ufs/dinode.h 1 1 /* 65 2 * Copyright (c) 2002 Networks Associates Technology, Inc. 66 /* 3 * All rights reserved. 67 * The size of physical and logical block numbers and time fields in UFS. 4 * 68 */ 5 * This software was developed for the FreeBSD Project by Marshall 69 typedef int32_t ufs1_daddr_t; 6 * Kirk McKusick and Network Associates Laboratories, the Security 70 typedef int64_t ufs2_daddr_t; 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 71 typedef int64_t ufs_lbn_t; 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 72 typedef int64_t ufs_time_t; 9 * research program 73 10 * 74 /* File permissions. */ 11 * Copyright (c) 1982, 1989, 1993 75 #define IEXEC 0000100 /* Executable. */ 12 * The Regents of the University of California. All rights reserved. 76 #define IWRITE 0000200 /* Writeable. */ 13 * (c) UNIX System Laboratories, Inc. 77 #define IREAD 0000400 /* Readable. */ 14 * All or some portions of this file are derived from material licensed 78 #define ISVTX 0001000 /* Sticky bit. */ 15 * to the University of California by American Telephone and Telegraph 79 #define ISGID 0002000 /* Set-gid. */ 16 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 80 #define ISUID 0004000 /* Set-uid. */ 17 * the permission of UNIX System Laboratories, Inc. 81 18 * 82 /* File types. */ 19 * Redistribution and use in source and binary forms, with or without 83 #define IFMT 0170000 /* Mask of file type. */ 20 * modification, are permitted provided that the following conditions 84 #define IFIFO 0010000 /* Named pipe (fifo). */ 21 * are met: 85 #define IFCHR 0020000 /* Character device. */ 22 * 1. Redistributions of source code must retain the above copyright 86 #define IFDIR 0040000 /* Directory file. */ 23 * notice, this list of conditions and the following disclaimer. 87 #define IFBLK 0060000 /* Block device. */ 24 * 2. Redistributions in binary form must reproduce the above copyright 88 #define IFREG 0100000 /* Regular file. */ 25 * notice, this list of conditions and the following disclaimer in the 89 #define IFLNK 0120000 /* Symbolic link. */ 26 * documentation and/or other materials provided with the distribution. 90 #define IFSOCK 0140000 /* UNIX domain socket. */ 27 * 3. The names of the authors may not be used to endorse or promote 91 #define IFWHT 0160000 /* Whiteout. */ 28 * products derived from this software without specific prior written 92 29 * permission. 93 /* 30 * 94 * A dinode contains all the meta-data associated with a UFS2 file. 31 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ‘‘AS IS’’ AND 95 * This structure defines the on-disk format of a dinode. Since 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 96 * this structure describes an on-disk structure, all its fields 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 97 * are defined by types with precise widths. 34 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 98 */ 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 99 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 100 #define NXADDR 2 /* External addresses in inode. */ 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 101 #define NDADDR 12 /* Direct addresses in inode. */ 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 102 #define NIADDR 3 /* Indirect addresses in inode. */ 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 103 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 104 struct ufs2_dinode { 41 * SUCH DAMAGE. 105 u_int16_t di_mode; /* 0: IFMT, permissions; see below. 42 * */ 43 * @(#)dinode.h 8.3 (Berkeley) 1/21/94 106 int16_t di_nlink; /* 2: File link count. */ 44 * $FreeBSD: src/sys/ufs/ufs/dinode.h,v 1.11 2002/07/16 22:36:00 mckusick Exp 107 u_int32_t di_uid; /* 4: File owner. */ $ 108 u_int32_t di_gid; /* 8: File group. */ 45 */ 109 u_int32_t di_blksize; /* 12: Inode blocksize. */ 46 110 u_int64_t di_size; /* 16: File byte count. */ 47 #ifndef _UFS_UFS_DINODE_H_ 111 u_int64_t di_blocks; /* 24: Bytes actually held. */ 48 #define _UFS_UFS_DINODE_H_ 112 ufs_time_t di_atime; /* 32: Last access time. */ 49 113 ufs_time_t di_mtime; /* 40: Last modified time. */ 50 /* 114 ufs_time_t di_ctime; /* 48: Last inode change time. */ 51 * The root inode is the root of the filesystem. Inode 0 can’t be used for 115 ufs_time_t di_birthtime; /* 56: Inode creation time. */ 52 * normal purposes and historically bad blocks were linked to inode 1, thus 116 int32_t di_mtimensec; /* 64: Last modified time. */ 53 * the root inode is 2. (Inode 1 is no longer used for this purpose, however 117 int32_t di_atimensec; /* 68: Last access time. */ 54 * numerous dump tapes make this assumption, so we are stuck with it). 118 int32_t di_ctimensec; /* 72: Last inode change time. */ 55 */ 119 int32_t di_birthnsec; /* 76: Inode creation time. */ 56 #define ROOTINO ((ino_t)2) 120 int32_t di_gen; /* 80: Generation number. */ 57 121 u_int32_t di_kernflags; /* 84: Kernel flags. */ 58 /* 122 u_int32_t di_flags; /* 88: Status flags (chflags). */ 59 * The Whiteout inode# is a dummy non-zero inode number which will 123 int32_t di_extsize; /* 92: External attributes block. */ 60 * never be allocated to a real file. It is used as a place holder 124 ufs2_daddr_t di_extb[NXADDR];/* 96: External attributes block. */ 61 * in the directory entry which has been tagged as a DT_W entry. 125 ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */ 62 * See the comments about ROOTINO above. 126 ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */ 63 */ 127 int64_t di_spare[3]; /* 232: Reserved; currently unused */ 64 #define WINO ((ino_t)1) 128 }; 07/16/02 15:36:00 sys/ufs/ufs/dinode.h 2 129 130 /* 131 * The di_db fields may be overlaid with other information for 132 * file types that do not have associated disk storage. Block 133 * and character devices overlay the first data block with their 134 * dev_t value. Short symbolic links place their path in the 135 * di_db area. 136 */ 137 #define di_rdev di_db[0] 138 139 /* 140 * A UFS1 dinode contains all the meta-data associated with a UFS1 file. 141 * This structure defines the on-disk format of a UFS1 dinode. Since 142 * this structure describes an on-disk structure, all its fields 143 * are defined by types with precise widths. 
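The ufs2_dinode declaration above documents every on-disk byte offset in its comments (di_db at 112, di_ib at 208, and a 256-byte total once the 24-byte di_spare is added). The check below re-declares the same layout with <stdint.h> types and verifies those offsets with static assertions, assuming the usual natural-alignment rules and no unusual padding; it is a verification sketch, not kernel code.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t ufs2_daddr_t;
typedef int64_t ufs_time_t;

#define NXADDR	2
#define NDADDR	12
#define NIADDR	3

/* Same field order and widths as struct ufs2_dinode in the listing. */
struct ufs2_dinode_model {
	uint16_t	di_mode;
	int16_t		di_nlink;
	uint32_t	di_uid;
	uint32_t	di_gid;
	uint32_t	di_blksize;
	uint64_t	di_size;
	uint64_t	di_blocks;
	ufs_time_t	di_atime;
	ufs_time_t	di_mtime;
	ufs_time_t	di_ctime;
	ufs_time_t	di_birthtime;
	int32_t		di_mtimensec;
	int32_t		di_atimensec;
	int32_t		di_ctimensec;
	int32_t		di_birthnsec;
	int32_t		di_gen;
	uint32_t	di_kernflags;
	uint32_t	di_flags;
	int32_t		di_extsize;
	ufs2_daddr_t	di_extb[NXADDR];
	ufs2_daddr_t	di_db[NDADDR];
	ufs2_daddr_t	di_ib[NIADDR];
	int64_t		di_spare[3];
};

int
main(void)
{
	/* The byte offsets in the listing's comments fall out of natural alignment. */
	static_assert(offsetof(struct ufs2_dinode_model, di_db) == 112, "di_db offset");
	static_assert(offsetof(struct ufs2_dinode_model, di_ib) == 208, "di_ib offset");
	static_assert(sizeof(struct ufs2_dinode_model) == 256, "dinode size");
	printf("ufs2 dinode is %zu bytes\n", sizeof(struct ufs2_dinode_model));
	return (0);
}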
144 */ 145 struct ufs1_dinode { 146 u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ 147 int16_t di_nlink; /* 2: File link count. */ 148 union { 149 u_int16_t oldids[2]; /* 4: Ffs: old user and group ids. * / 150 } di_u; 151 u_int64_t di_size; /* 8: File byte count. */ 152 int32_t di_atime; /* 16: Last access time. */ 153 int32_t di_atimensec; /* 20: Last access time. */ 154 int32_t di_mtime; /* 24: Last modified time. */ 155 int32_t di_mtimensec; /* 28: Last modified time. */ 156 int32_t di_ctime; /* 32: Last inode change time. */ 157 int32_t di_ctimensec; /* 36: Last inode change time. */ 158 ufs1_daddr_t di_db[NDADDR]; /* 40: Direct disk blocks. */ 159 ufs1_daddr_t di_ib[NIADDR]; /* 88: Indirect disk blocks. */ 160 u_int32_t di_flags; /* 100: Status flags (chflags). */ 161 int32_t di_blocks; /* 104: Blocks actually held. */ 162 int32_t di_gen; /* 108: Generation number. */ 163 u_int32_t di_uid; /* 112: File owner. */ 164 u_int32_t di_gid; /* 116: File group. */ 165 int32_t di_spare[2]; /* 120: Reserved; currently unused */ 166 }; 167 #define di_ogid di_u.oldids[1] 168 #define di_ouid di_u.oldids[0] 169 170 #endif /* _UFS_UFS_DINODE_H_ */ 08/15/03 13:03:19 sys/ufs/ufs/inode.h 1 1 /* 66 struct vnode *i_vnode;/* Vnode associated with this inode. */ 2 * Copyright (c) 1982, 1989, 1993 67 struct ufsmount *i_ump;/* Ufsmount point associated with this inode. 3 * The Regents of the University of California. All rights reserved. */ 4 * (c) UNIX System Laboratories, Inc. 68 u_int32_t i_flag; /* flags, see below */ 5 * All or some portions of this file are derived from material licensed 69 struct cdev *i_dev; /* Device associated with the inode. */ 6 * to the University of California by American Telephone and Telegraph 70 ino_t i_number; /* The identity of the inode. */ 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 71 int i_effnlink; /* i_nlink when I/O completes */ 8 * the permission of UNIX System Laboratories, Inc. 72 9 * 73 struct fs *i_fs; /* Associated filesystem superblock. */ 10 * Redistribution and use in source and binary forms, with or without 74 struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ 11 * modification, are permitted provided that the following conditions 75 u_quad_t i_modrev; /* Revision level for NFS lease. */ 12 * are met: 76 struct lockf *i_lockf;/* Head of byte-level lock list. */ 13 * 1. Redistributions of source code must retain the above copyright 77 /* 14 * notice, this list of conditions and the following disclaimer. 78 * Side effects; used during directory lookup. 15 * 2. Redistributions in binary form must reproduce the above copyright 79 */ 16 * notice, this list of conditions and the following disclaimer in the 80 int32_t i_count; /* Size of free slot in directory. */ 17 * documentation and/or other materials provided with the distribution. 81 doff_t i_endoff; /* End of useful stuff in directory. */ 18 * 3. All advertising materials mentioning features or use of this software 82 doff_t i_diroff; /* Offset in dir, where we found last entry. * 19 * must display the following acknowledgement: / 20 * This product includes software developed by the University of 83 doff_t i_offset; /* Offset of free space in directory. */ 21 * California, Berkeley and its contributors. 84 ino_t i_ino; /* Inode number of found directory. */ 22 * 4. Neither the name of the University nor the names of its contributors 85 u_int32_t i_reclen; /* Size of found directory entry. 
*/ 23 * may be used to endorse or promote products derived from this software 86 24 * without specific prior written permission. 87 union { 25 * 88 struct *dirhash; /* Hashing for large directories. */ 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 89 daddr_t *snapblklist; /* Collect expunged snapshot blocks. 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE */ 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 90 } i_un; 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 91 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 92 /* 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 93 * Data for extended attribute modification. 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 94 */ 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 95 u_char *i_ea_area; /* Pointer to malloced copy of EA area */ 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 96 unsigned i_ea_len; /* Length of i_ea_area */ 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 97 int i_ea_error; /* First errno in transaction */ 36 * SUCH DAMAGE. 98 37 * 99 /* 38 * @(#)inode.h 8.9 (Berkeley) 5/14/95 100 * Copies from the on-disk dinode itself. 39 * $FreeBSD: src/sys/ufs/ufs/inode.h,v 1.44 2003/08/15 20:03:19 phk Exp $ 101 */ 40 */ 102 u_int16_t i_mode; /* IFMT, permissions; see below. */ 41 103 int16_t i_nlink; /* File link count. */ 42 #ifndef _UFS_UFS_INODE_H_ 104 u_int64_t i_size; /* File byte count. */ 43 #define _UFS_UFS_INODE_H_ 105 u_int32_t i_flags; /* Status flags (chflags). */ 44 106 int64_t i_gen; /* Generation number. */ 45 #include 107 u_int32_t i_uid; /* File owner. */ 46 #include 108 u_int32_t i_gid; /* File group. */ 47 #include 109 /* 48 110 * The real copy of the on-disk inode. 49 /* 111 */ 50 * This must agree with the definition in . 112 union { 51 */ 113 struct ufs1_dinode *din1; /* UFS1 on-disk dinode. */ 52 #define doff_t int32_t 114 struct ufs2_dinode *din2; /* UFS2 on-disk dinode. */ 53 115 } dinode_u; 54 /* 116 }; 55 * The inode is used to describe each active (or recently active) file in the 117 /* 56 * UFS filesystem. It is composed of two types of information. The first part 118 * These flags are kept in i_flag. 57 * is the information that is needed only while the file is active (such as 119 */ 58 * the identity of the file and linkage to speed its lookup). The second part 120 #define IN_ACCESS 0x0001 /* Access time update request. */ 59 * is the permanent meta-data associated with the file which is read in 121 #define IN_CHANGE 0x0002 /* Inode change time update request. * 60 * from the permanent dinode from long term storage when the file becomes / 61 * active, and is put back when the file is no longer being used. 122 #define IN_UPDATE 0x0004 /* Modification time update request. * 62 */ / 63 struct inode { 123 #define IN_MODIFIED 0x0008 /* Inode has been modified. */ 64 LIST_ENTRY(inode) i_hash;/* Hash chain. */ 124 #define IN_RENAME 0x0010 /* Inode is being renamed. */ 65 TAILQ_ENTRY(inode) i_nextsnap; /* snapshot file list. */ 125 #define IN_HASHED 0x0020 /* Inode is on hash list */ 08/15/03 13:03:19 sys/ufs/ufs/inode.h 2 126 #define IN_LAZYMOD 0x0040 /* Modified, but don’t write yet. */ 127 #define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. 
* / 128 129 #define i_devvp i_ump->um_devvp 130 #define i_dirhash i_un.dirhash 131 #define i_snapblklist i_un.snapblklist 132 #define i_din1 dinode_u.din1 133 #define i_din2 dinode_u.din2 134 135 #ifdef _KERNEL 136 /* 137 * The DIP macro is used to access fields in the dinode that are 138 * not cached in the inode itself. 139 */ 140 #define DIP(ip, field) \ 141 (((ip)->i_ump->um_fstype == UFS1) ? \ 142 (ip)->i_din1->d##field : (ip)->i_din2->d##field) 143 144 #define MAXSYMLINKLEN(ip) \ 145 ((ip)->i_ump->um_fstype == UFS1) ? \ 146 ((NDADDR + NIADDR) * sizeof(ufs1_daddr_t)) : \ 147 ((NDADDR + NIADDR) * sizeof(ufs2_daddr_t)) 148 #define SHORTLINK(ip) \ 149 (((ip)->i_ump->um_fstype == UFS1) ? \ 150 (caddr_t)(ip)->i_din1->di_db : (caddr_t)(ip)->i_din2->di_db) 151 152 /* 153 * Structure used to pass around logical block paths generated by 154 * ufs_getlbns and used by truncate and bmap code. 155 */ 156 struct indir { 157 ufs2_daddr_t in_lbn; /* Logical block number. */ 158 int in_off; /* Offset in buffer. */ 159 int in_exists; /* Flag if the block exists. */ 160 }; 161 162 /* Convert between inode pointers and vnode pointers. */ 163 #define VTOI(vp) ((struct inode *)(vp)->v_data) 164 #define ITOV(ip) ((ip)->i_vnode) 165 166 /* Determine if soft dependencies are being done */ 167 #define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & MNT_SOFTDEP) 168 #define DOINGASYNC(vp) ((vp)->v_mount->mnt_flag & MNT_ASYNC) 169 170 /* This overlays the fid structure (see mount.h). */ 171 struct ufid { 172 u_int16_t ufid_len; /* Length of structure. */ 173 u_int16_t ufid_pad; /* Force 32-bit alignment. */ 174 ino_t ufid_ino; /* File number (ino). */ 175 int32_t ufid_gen; /* Generation number. */ 176 }; 177 #endif /* _KERNEL */ 178 179 #endif /* !_UFS_UFS_INODE_H_ */ 10/18/03 07:10:27 sys/ufs/ufs/ufs_bmap.c 1 1 /* 323 2 * Copyright (c) 1989, 1991, 1993 324 ump = VFSTOUFS(vp->v_mount); 3 * The Regents of the University of California. All rights reserved. 325 if (nump) 4 * (c) UNIX System Laboratories, Inc. 326 *nump = 0; 5 * All or some portions of this file are derived from material licensed 327 numlevels = 0; 6 * to the University of California by American Telephone and Telegraph 328 realbn = bn; 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 329 if (bn < 0) 8 * the permission of UNIX System Laboratories, Inc. 330 bn = -bn; 9 * 331 10 * Redistribution and use in source and binary forms, with or without 332 /* The first NDADDR blocks are direct blocks. */ 11 * modification, are permitted provided that the following conditions 333 if (bn < NDADDR) 12 * are met: 334 return (0); 13 * 1. Redistributions of source code must retain the above copyright 335 14 * notice, this list of conditions and the following disclaimer. 336 /* 15 * 2. Redistributions in binary form must reproduce the above copyright 337 * Determine the number of levels of indirection. After this loop 16 * notice, this list of conditions and the following disclaimer in the 338 * is done, blockcnt indicates the number of data blocks possible 17 * documentation and/or other materials provided with the distribution. 339 * at the previous level of indirection, and NIADDR - i is the number 18 * 3. All advertising materials mentioning features or use of this software 340 * of levels of indirection needed to locate the requested block. 
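The DIP() macro above selects the UFS1 or UFS2 on-disk dinode at run time and pastes the field name so that a cached name like i_size resolves to di_size in whichever dinode is active. A compact model of that token-pasting trick, with the structures reduced to a single field and all names hypothetical:

#include <stdint.h>
#include <stdio.h>

#define UFS1 1
#define UFS2 2

struct din1_model { uint64_t di_size; };	/* stands in for struct ufs1_dinode */
struct din2_model { uint64_t di_size; };	/* stands in for struct ufs2_dinode */

struct inode_model {
	int fstype;				/* UFS1 or UFS2, like um_fstype */
	struct din1_model *i_din1;
	struct din2_model *i_din2;
};

/*
 * Same shape as DIP(): the caller passes "i_size" and the leading "i" is
 * replaced by "d" via token pasting, yielding di_size in the chosen dinode.
 */
#define DIP_MODEL(ip, field)					\
	(((ip)->fstype == UFS1) ?				\
	    (ip)->i_din1->d##field : (ip)->i_din2->d##field)

int
main(void)
{
	struct din1_model d1 = { .di_size = 100 };
	struct din2_model d2 = { .di_size = 200 };
	struct inode_model ip = { .fstype = UFS2, .i_din1 = &d1, .i_din2 = &d2 };

	printf("size via DIP: %llu\n",
	    (unsigned long long)DIP_MODEL(&ip, i_size));
	return (0);
}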
19 * must display the following acknowledgement: 341 */ 20 * This product includes software developed by the University of 342 for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { 21 * California, Berkeley and its contributors. 343 if (i == 0) 22 * 4. Neither the name of the University nor the names of its contributors 344 return (EFBIG); 23 * may be used to endorse or promote products derived from this software 345 blockcnt *= MNINDIR(ump); 24 * without specific prior written permission. 346 if (bn < blockcnt) 25 * 347 break; 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 348 } 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 349 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 350 /* Calculate the address of the first meta-block. */ 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 351 if (realbn >= 0) 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 352 metalbn = -(realbn - bn + NIADDR - i); 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 353 else 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 354 metalbn = -(-realbn - bn + NIADDR - i); 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 355 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 356 /* 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 357 * At each iteration, off is the offset into the bap array which is 36 * SUCH DAMAGE. 358 * an array of disk addresses at the current level of indirection. 37 * 359 * The logical block number and the offset in that block are stored 38 * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 360 * into the argument array. 39 */ 361 */ 40 362 ap->in_lbn = metalbn; 41 #include 363 ap->in_off = off = NIADDR - i; 42 __FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_bmap.c,v 1.59 2003/10/18 14:10:27 phk 364 ap->in_exists = 0; Exp $"); 365 ap++; 366 for (++numlevels; i <= NIADDR; i++) { 367 /* If searching for a meta-data block, quit when found. */ 303 /* 368 if (metalbn == realbn) 304 * Create an array of logical block number/offset pairs which represent the 369 break; 305 * path of indirect blocks required to access a data block. The first "pair" 370 306 * contains the logical block number of the appropriate single, double or 371 blockcnt /= MNINDIR(ump); 307 * triple indirect block and the offset into the inode indirect block array. 372 off = (bn / blockcnt) % MNINDIR(ump); 308 * Note, the logical block number of the inode single/double/triple indirect 373 309 * block appears twice in the array, once with the offset into the i_ib and 374 ++numlevels; 310 * once with the offset into the page itself. 375 ap->in_lbn = metalbn; 311 */ 376 ap->in_off = off; 312 int 377 ap->in_exists = 0; 313 ufs_getlbns(vp, bn, ap, nump) 378 ++ap; 314 struct vnode *vp; 379 315 ufs2_daddr_t bn; 380 metalbn -= -1 + off * blockcnt; 316 struct indir *ap; 381 } 317 int *nump; 382 if (nump) 318 { 383 *nump = numlevels; 319 ufs2_daddr_t blockcnt; 384 return (0); 320 ufs_lbn_t metalbn, realbn; 385 } 321 struct ufsmount *ump; 322 int i, numlevels, off; 04/21/04 19:15:12 sys/ufs/ufs/ufsmount.h 1 1 /* 68 u_long um_fstype; /* type of filesystem */ 2 * Copyright (c) 1982, 1986, 1989, 1993 69 struct fs *um_fs; /* pointer to superblock */ 3 * The Regents of the University of California. All rights reserved. 
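The loop at the top of ufs_getlbns() above counts how many levels of indirection a logical block needs, given NDADDR direct pointers and NIADDR indirect pointers per inode. Here is a standalone version of just that computation, with a worked main(); the pointers-per-block value of 2048 is only an illustrative MNINDIR figure, not taken from the listing.

#include <stdint.h>
#include <stdio.h>

#define NDADDR	12	/* direct addresses in the inode */
#define NIADDR	3	/* indirect addresses in the inode */

/*
 * Return the number of indirection levels (1..NIADDR) needed to reach
 * logical block bn, 0 for a direct block, or -1 if it is out of range
 * (EFBIG in the kernel). Mirrors the level-counting loop in ufs_getlbns().
 */
static int
indir_levels(int64_t bn, int64_t nindir)
{
	int64_t blockcnt;
	int i;

	if (bn < NDADDR)
		return (0);
	for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) {
		if (i == 0)
			return (-1);
		blockcnt *= nindir;
		if (bn < blockcnt)
			break;
	}
	return (NIADDR - i + 1);
}

int
main(void)
{
	const int64_t nindir = 2048;	/* illustrative pointers-per-block value */
	int64_t probes[] = { 5, 12, 12 + 2048, 12 + 2048 + 2048LL * 2048 };

	for (unsigned n = 0; n < sizeof(probes) / sizeof(probes[0]); n++)
		printf("lbn %lld -> %d level(s) of indirection\n",
		    (long long)probes[n], indir_levels(probes[n], nindir));
	return (0);
}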
70 struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */ 4 * 71 struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */ 5 * Redistribution and use in source and binary forms, with or without 72 struct ufs_extattr_per_mount um_extattr; /* extended attrs */ 6 * modification, are permitted provided that the following conditions 73 u_long um_nindir; /* indirect ptrs per block */ 7 * are met: 74 u_long um_bptrtodb; /* indir ptr to disk block */ 8 * 1. Redistributions of source code must retain the above copyright 75 u_long um_seqinc; /* inc between seq blocks */ 9 * notice, this list of conditions and the following disclaimer. 76 long um_numindirdeps; /* indirdeps for this filesys 10 * 2. Redistributions in binary form must reproduce the above copyright */ 11 * notice, this list of conditions and the following disclaimer in the 77 time_t um_btime[MAXQUOTAS]; /* block quota time limit */ 12 * documentation and/or other materials provided with the distribution. 78 time_t um_itime[MAXQUOTAS]; /* inode quota time limit */ 17 * 3. Neither the name of the University nor the names of its contributors 79 char um_qflags[MAXQUOTAS]; /* quota specific flags */ 18 * may be used to endorse or promote products derived from this software 80 int64_t um_savedmaxfilesize; /* XXX - limit maxfilesize */ 19 * without specific prior written permission. 81 int (*um_balloc)(struct vnode *, off_t, int, struct ucred *, int, 20 * struct buf **); 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 82 int (*um_blkatoff)(struct vnode *, off_t, char **, struct buf **); 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 83 int (*um_truncate)(struct vnode *, off_t, int, struct ucred *, str 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE uct thread *); 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 84 int (*um_update)(struct vnode *, int); 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 85 int (*um_valloc)(struct vnode *, int, struct ucred *, struct vnode 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS **); 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 86 int (*um_vfree)(struct vnode *, ino_t, int); 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 87 void (*um_ifree)(struct ufsmount *, struct inode *); 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 88 }; 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 89 31 * SUCH DAMAGE. 
90 #define UFS_BALLOC(aa, bb, cc, dd, ee, ff) VFSTOUFS((aa)->v_mount)->um_balloc( 32 * aa, bb, cc, dd, ee, ff) 33 * @(#)ufsmount.h 8.6 (Berkeley) 3/30/95 91 #define UFS_BLKATOFF(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_blkatoff(aa, 34 * $FreeBSD: src/sys/ufs/ufs/ufsmount.h,v 1.28 2003/01/07 18:23:50 mckusick Ex bb, cc, dd) p $ 92 #define UFS_TRUNCATE(aa, bb, cc, dd, ee) VFSTOUFS((aa)->v_mount)->um_truncate( 35 */ aa, bb, cc, dd, ee) 36 93 #define UFS_UPDATE(aa, bb) VFSTOUFS((aa)->v_mount)->um_update(aa, bb) 37 #ifndef _UFS_UFS_UFSMOUNT_H_ 94 #define UFS_VALLOC(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, 38 #define _UFS_UFS_UFSMOUNT_H_ cc, dd) 39 95 #define UFS_VFREE(aa, bb, cc) VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc) 40 /* 96 #define UFS_IFREE(aa, bb) ((aa)->um_ifree(aa, bb)) 41 * Arguments to mount UFS-based filesystems 97 42 */ 98 /* 43 struct ufs_args { 99 * Filesystem types 44 char *fspec; /* block special device to mount */ 100 */ 45 struct export_args export; /* network export information */ 101 #define UFS1 1 46 }; 102 #define UFS2 2 47 103 48 #ifdef _KERNEL 104 /* 49 105 * Flags describing the state of quotas. 50 #ifdef MALLOC_DECLARE 106 */ 51 MALLOC_DECLARE(M_UFSMNT); 107 #define QTF_OPENING 0x01 /* Q_QUOTAON in progress */ 52 #endif 108 #define QTF_CLOSING 0x02 /* Q_QUOTAOFF in progress */ 53 109 54 struct buf; 110 /* Convert mount ptr to ufsmount ptr. */ 55 struct inode; 111 #define VFSTOUFS(mp) ((struct ufsmount *)((mp)->mnt_data)) 56 struct nameidata; 112 57 struct timeval; 113 /* 58 struct ucred; 114 * Macros to access filesystem parameters in the ufsmount structure. 59 struct uio; 115 * Used by ufs_bmap. 60 struct vnode; 116 */ 61 struct ufs_extattr_per_mount; 117 #define MNINDIR(ump) ((ump)->um_nindir) 62 118 #define blkptrtodb(ump, b) ((b) << (ump)->um_bptrtodb) 63 /* This structure describes the UFS specific mount structure data. */ 119 #define is_sequential(ump, a, b) ((b) == (a) + ump->um_seqinc) 64 struct ufsmount { 120 #endif /* _KERNEL */ 65 struct mount *um_mountp; /* filesystem vfs structure */ 121 66 dev_t um_dev; /* device mounted */ 122 #endif 67 struct vnode *um_devvp; /* block device mounted vnode */ 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 1 1 /* 65 #include 2 * Copyright (c) 2002 Networks Associates Technology, Inc. 66 #include 3 * All rights reserved. 67 #include 4 * 68 #include 5 * This software was developed for the FreeBSD Project by Marshall 69 6 * Kirk McKusick and Network Associates Laboratories, the Security 70 #include 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 71 #include 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 72 9 * research program 73 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, int cg, ufs2_daddr_t bpref, 10 * 74 int size); 11 * Copyright (c) 1982, 1986, 1989, 1993 75 12 * The Regents of the University of California. All rights reserved. 76 static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int); 13 * 77 static ufs2_daddr_t 14 * Redistribution and use in source and binary forms, with or without 78 ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t); 15 * modification, are permitted provided that the following conditions 79 #ifdef DIAGNOSTIC 16 * are met: 80 static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); 17 * 1. Redistributions of source code must retain the above copyright 81 #endif 18 * notice, this list of conditions and the following disclaimer. 
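The UFS_BALLOC/UFS_UPDATE/UFS_VALLOC macros above dispatch through per-mount function pointers in struct ufsmount, so UFS1, UFS2, and soft-updates variants can each plug in their own implementation behind one call site. A compact model of that indirection, with hypothetical names:

#include <stdio.h>

/* Per-mount operations table, in the spirit of um_update/um_balloc. */
struct mount_model {
	const char *name;
	int (*m_update)(struct mount_model *, int waitfor);
};

static int
update_v1(struct mount_model *mp, int waitfor)
{
	printf("%s: update (waitfor=%d), UFS1 path\n", mp->name, waitfor);
	return (0);
}

static int
update_v2(struct mount_model *mp, int waitfor)
{
	printf("%s: update (waitfor=%d), UFS2 path\n", mp->name, waitfor);
	return (0);
}

/* Dispatch macro shaped like UFS_UPDATE(). */
#define MODEL_UPDATE(mp, waitfor) ((mp)->m_update((mp), (waitfor)))

int
main(void)
{
	struct mount_model m1 = { "ufs1-mount", update_v1 };
	struct mount_model m2 = { "ufs2-mount", update_v2 };

	MODEL_UPDATE(&m1, 0);
	MODEL_UPDATE(&m2, 1);
	return (0);
}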
82 static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int); 19 * 2. Redistributions in binary form must reproduce the above copyright 83 static ino_t ffs_dirpref(struct inode *); 20 * notice, this list of conditions and the following disclaimer in the 84 static ufs2_daddr_t ffs_fragextend(struct inode *, int, ufs2_daddr_t, int, int 21 * documentation and/or other materials provided with the distribution. ); 22 * 3. All advertising materials mentioning features or use of this software 85 static void ffs_fserr(struct fs *, ino_t, char *); 23 * must display the following acknowledgement: 86 static ufs2_daddr_t ffs_hashalloc 24 * This product includes software developed by the University of 87 (struct inode *, int, ufs2_daddr_t, int, allocfcn_t *); 25 * California, Berkeley and its contributors. 88 static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int); 26 * 4. Neither the name of the University nor the names of its contributors 89 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int) 27 * may be used to endorse or promote products derived from this software ; 28 * without specific prior written permission. 90 static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 29 * 91 static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 92 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 93 /* 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 94 * Allocate a block in the filesystem. 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 95 * 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 96 * The size of the requested block is given, which must be some 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 97 * multiple of fs_fsize and <= fs_bsize. 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 98 * A preference may be optionally specified. If a preference is given 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 99 * the following hierarchy is used to allocate a block: 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 100 * 1) allocate the requested block. 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 101 * 2) allocate a rotationally optimal block in the same cylinder. 40 * SUCH DAMAGE. 102 * 3) allocate a block in the same cylinder group. 41 * 103 * 4) quadradically rehash into other cylinder groups, until an 42 * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 104 * available block is located. 43 */ 105 * If no block preference is given the following heirarchy is used 44 106 * to allocate a block: 45 #include 107 * 1) allocate a block in the cylinder group that contains the 46 __FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_alloc.c,v 1.116 2003/10/31 07:25:06 tr 108 * inode for the file. uckman Exp $"); 109 * 2) quadradically rehash into other cylinder groups, until an 47 110 * available block is located. 
48 #include "opt_quota.h" 111 */ 49 112 int 50 #include 113 ffs_alloc(ip, lbn, bpref, size, cred, bnp) 51 #include 114 struct inode *ip; 52 #include 115 ufs2_daddr_t lbn, bpref; 53 #include 116 int size; 54 #include 117 struct ucred *cred; 55 #include 118 ufs2_daddr_t *bnp; 56 #include 119 { 57 #include 120 struct fs *fs; 58 #include 121 ufs2_daddr_t bno; 59 #include 122 int cg, reclaimed; 60 #include 123 #ifdef QUOTA 61 #include 124 int error; 62 #include 125 #endif 63 126 64 #include 127 *bnp = 0; 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 2 128 fs = ip->i_fs; 193 ufs2_daddr_t bprev; 129 #ifdef DIAGNOSTIC 194 ufs2_daddr_t bpref; 130 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 195 int osize, nsize; 131 printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", 196 struct ucred *cred; 132 devtoname(ip->i_dev), (long)fs->fs_bsize, size, 197 struct buf **bpp; 133 fs->fs_fsmnt); 198 { 134 panic("ffs_alloc: bad size"); 199 struct vnode *vp; 135 } 200 struct fs *fs; 136 if (cred == NOCRED) 201 struct buf *bp; 137 panic("ffs_alloc: missing credential"); 202 int cg, request, error, reclaimed; 138 #endif /* DIAGNOSTIC */ 203 ufs2_daddr_t bno; 139 reclaimed = 0; 204 140 retry: 205 *bpp = 0; 141 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 206 vp = ITOV(ip); 142 goto nospace; 207 fs = ip->i_fs; 143 if (suser_cred(cred, PRISON_ROOT) && 208 #ifdef DIAGNOSTIC 144 freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) 209 if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 145 goto nospace; 210 panic("ffs_realloccg: allocation on suspended filesystem"); 146 #ifdef QUOTA 211 if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 147 error = chkdq(ip, btodb(size), cred, 0); 212 (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 148 if (error) 213 printf( 149 return (error); 214 "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", 150 #endif 215 devtoname(ip->i_dev), (long)fs->fs_bsize, osize, 151 if (bpref >= fs->fs_size) 216 nsize, fs->fs_fsmnt); 152 bpref = 0; 217 panic("ffs_realloccg: bad size"); 153 if (bpref == 0) 218 } 154 cg = ino_to_cg(fs, ip->i_number); 219 if (cred == NOCRED) 155 else 220 panic("ffs_realloccg: missing credential"); 156 cg = dtog(fs, bpref); 221 #endif /* DIAGNOSTIC */ 157 bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg); 222 reclaimed = 0; 158 if (bno > 0) { 223 retry: 159 DIP(ip, i_blocks) += btodb(size); 224 if (suser_cred(cred, PRISON_ROOT) && 160 ip->i_flag |= IN_CHANGE | IN_UPDATE; 225 freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) 161 *bnp = bno; 226 goto nospace; 162 return (0); 227 if (bprev == 0) { 163 } 228 printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", 164 #ifdef QUOTA 229 devtoname(ip->i_dev), (long)fs->fs_bsize, (intmax_t)bprev, 165 /* 230 fs->fs_fsmnt); 166 * Restore user’s disk quota because allocation failed. 231 panic("ffs_realloccg: bad bprev"); 167 */ 232 } 168 (void) chkdq(ip, -btodb(size), cred, FORCE); 233 /* 169 #endif 234 * Allocate the extra space in the buffer. 
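Note the reserve check in ffs_alloc(): unless the credential passes suser_cred(), the allocation fails once it would push free space below the fs_minfree percentage. A rough userland model of that test follows; the real freespace() and numfrags() macros live in fs.h and account for whole blocks plus loose fragments, so the "free fragments minus reserve" form below is an approximation, and all the numbers are assumptions.

#include <stdio.h>
#include <stdint.h>

/*
 * Model of the check:
 *   if (not superuser && freespace(fs, fs_minfree) - numfrags(fs, size) < 0)
 *           goto nospace;          (ENOSPC for ordinary users)
 */
static int
would_exceed_reserve(int64_t free_frags, int64_t data_frags,
    int minfree_pct, int64_t request_frags)
{
	int64_t reserve = data_frags * minfree_pct / 100;

	return (free_frags - reserve - request_frags < 0);
}

int
main(void)
{
	int64_t data_frags = 10000000;	/* assumed fs_dsize, in fragments */
	int64_t free_frags = 820000;	/* assumed current free fragments */
	int minfree = 8;		/* assumed fs_minfree = 8% */
	int64_t req = 16;		/* fragments needed for this request */

	printf("reserve is %lld frags\n",
	    (long long)(data_frags * minfree / 100));
	printf("request %s\n",
	    would_exceed_reserve(free_frags, data_frags, minfree, req) ?
	    "denied for non-root (ENOSPC)" : "allowed");
	return (0);
}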
170 nospace: 235 */ 171 if (fs->fs_pendingblocks > 0 && reclaimed == 0) { 236 error = bread(vp, lbprev, osize, NOCRED, &bp); 172 reclaimed = 1; 237 if (error) { 173 softdep_request_cleanup(fs, ITOV(ip)); 238 brelse(bp); 174 goto retry; 239 return (error); 175 } 240 } 176 ffs_fserr(fs, ip->i_number, "filesystem full"); 241 177 uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); 242 if (bp->b_blkno == bp->b_lblkno) { 178 return (ENOSPC); 243 if (lbprev >= NDADDR) 179 } 244 panic("ffs_realloccg: lbprev out of range"); 180 245 bp->b_blkno = fsbtodb(fs, bprev); 181 /* 246 } 182 * Reallocate a fragment to a bigger size 247 183 * 248 #ifdef QUOTA 184 * The number and size of the old block is given, and a preference 249 error = chkdq(ip, btodb(nsize - osize), cred, 0); 185 * and new size is also specified. The allocator attempts to extend 250 if (error) { 186 * the original block. Failing that, the regular block allocator is 251 brelse(bp); 187 * invoked to get an appropriate block. 252 return (error); 188 */ 253 } 189 int 254 #endif 190 ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, cred, bpp) 255 /* 191 struct inode *ip; 256 * Check for extension in the existing location. 192 ufs2_daddr_t lbprev; 257 */ 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 3 258 cg = dtog(fs, bprev); 320 if (bno > 0) { 259 bno = ffs_fragextend(ip, cg, bprev, osize, nsize); 321 bp->b_blkno = fsbtodb(fs, bno); 260 if (bno) { 322 if (!DOINGSOFTDEP(vp)) 261 if (bp->b_blkno != fsbtodb(fs, bno)) 323 ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, 262 panic("ffs_realloccg: bad blockno"); 324 ip->i_number); 263 DIP(ip, i_blocks) += btodb(nsize - osize); 325 if (nsize < request) 264 ip->i_flag |= IN_CHANGE | IN_UPDATE; 326 ffs_blkfree(fs, ip->i_devvp, bno + numfrags(fs, nsize) 265 allocbuf(bp, nsize); , 266 bp->b_flags |= B_DONE; 327 (long)(request - nsize), ip->i_number); 267 bzero((char *)bp->b_data + osize, (u_int)nsize - osize); 328 DIP(ip, i_blocks) += btodb(nsize - osize); 268 *bpp = bp; 329 ip->i_flag |= IN_CHANGE | IN_UPDATE; 269 return (0); 330 allocbuf(bp, nsize); 270 } 331 bp->b_flags |= B_DONE; 271 /* 332 bzero((char *)bp->b_data + osize, (u_int)nsize - osize); 272 * Allocate a new disk location. 333 *bpp = bp; 273 */ 334 return (0); 274 if (bpref >= fs->fs_size) 335 } 275 bpref = 0; 336 #ifdef QUOTA 276 switch ((int)fs->fs_optim) { 337 /* 277 case FS_OPTSPACE: 338 * Restore user’s disk quota because allocation failed. 278 /* 339 */ 279 * Allocate an exact sized fragment. Although this makes 340 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 280 * best use of space, we will waste time relocating it if 341 #endif 281 * the file continues to grow. If the fragmentation is 342 brelse(bp); 282 * less than half of the minimum free reserve, we choose 343 nospace: 283 * to begin optimizing for time. 
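Both ffs_alloc() and ffs_realloccg() use the same "retry once" idiom on their nospace path: if soft updates still owes the filesystem blocks (fs_pendingblocks > 0), ask softdep_request_cleanup() to flush them and retry the allocation exactly once before reporting ENOSPC. A tiny sketch of that control flow; the cleanup routine here is a stand-in, not the kernel function.

#include <stdio.h>
#include <errno.h>

static long pendingblocks = 64;	/* pretend softdep owes us some blocks */
static int freeblocks;		/* blocks actually free right now */

/* Stand-in for softdep_request_cleanup(): release the pending blocks. */
static void
request_cleanup(void)
{
	freeblocks += pendingblocks;
	pendingblocks = 0;
}

static int
alloc_block(void)
{
	int reclaimed = 0;

retry:
	if (freeblocks > 0) {
		freeblocks--;
		return (0);			/* success */
	}
	if (pendingblocks > 0 && reclaimed == 0) {
		reclaimed = 1;			/* only one retry */
		request_cleanup();
		goto retry;
	}
	return (ENOSPC);			/* "filesystem full" */
}

int
main(void)
{
	printf("first allocation: %d\n", alloc_block());
	return (0);
}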
344 /* 284 */ 345 * no space available 285 request = nsize; 346 */ 286 if (fs->fs_minfree <= 5 || 347 if (fs->fs_pendingblocks > 0 && reclaimed == 0) { 287 fs->fs_cstotal.cs_nffree > 348 reclaimed = 1; 288 (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) 349 softdep_request_cleanup(fs, vp); 289 break; 350 goto retry; 290 log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n 351 } ", 352 ffs_fserr(fs, ip->i_number, "filesystem full"); 291 fs->fs_fsmnt); 353 uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); 292 fs->fs_optim = FS_OPTTIME; 354 return (ENOSPC); 293 break; 355 } 294 case FS_OPTTIME: 356 295 /* 357 /* 296 * At this point we have discovered a file that is trying to 358 * Reallocate a sequence of blocks into a contiguous sequence of blocks. 297 * grow a small fragment to a larger fragment. To save time, 359 * 298 * we allocate a full sized block, then free the unused portio 360 * The vnode and an array of buffer pointers for a range of sequential n. 361 * logical blocks to be made contiguous is given. The allocator attempts 299 * If the file continues to grow, the ‘ffs_fragextend’ call 362 * to find a range of sequential blocks starting as close as possible 300 * above will be able to grow it in place without further 363 * from the end of the allocation for the logical block immediately 301 * copying. If aberrant programs cause disk fragmentation to 364 * preceding the current range. If successful, the physical block numbers 302 * grow within 2% of the free reserve, we choose to begin 365 * in the buffer pointers and in the inode are changed to reflect the new 303 * optimizing for space. 366 * allocation. If unsuccessful, the allocation is left unchanged. The 304 */ 367 * success in doing the reallocation is returned. Note that the error 305 request = fs->fs_bsize; 368 * return is not reflected back to the user. Rather the previous block 306 if (fs->fs_cstotal.cs_nffree < 369 * allocation will be used. 307 (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) 370 */ 308 break; 371 309 log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n 372 SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); ", 373 310 fs->fs_fsmnt); 374 static int doasyncfree = 1; 311 fs->fs_optim = FS_OPTSPACE; 375 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, ""); 312 break; 376 313 default: 377 static int doreallocblks = 1; 314 printf("dev = %s, optim = %ld, fs = %s\n", 378 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, " 315 devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt); "); 316 panic("ffs_realloccg: bad optim"); 379 317 /* NOTREACHED */ 380 #ifdef DEBUG 318 } 381 static volatile int prtrealloc = 0; 319 bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg); 382 #endif 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 4 383 447 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 384 int 448 return (ENOSPC); 385 ffs_reallocblks(ap) 449 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 386 struct vop_reallocblks_args /* { 450 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 387 struct vnode *a_vp; 451 return (ENOSPC); 388 struct cluster_save *a_buflist; 452 /* 389 } */ *ap; 453 * Get the starting offset and block map for the first block. 
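The FS_OPTSPACE/FS_OPTTIME switch in ffs_realloccg() has hysteresis: running space-optimized, the filesystem flips to time optimization once fragmented free space (cs_nffree) drops to half of the minfree reserve or below; running time-optimized, it flips back once fragmentation climbs to within 2% of the reserve. For a reserve of 8% those thresholds work out as shown below; the filesystem size is an assumption.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t fs_dsize = 100000000;	/* assumed data fragments in the fs */
	int fs_minfree = 8;		/* assumed reserve, in percent */

	/* SPACE -> TIME when cs_nffree <= dsize * minfree / (2 * 100). */
	int64_t to_time = fs_dsize * fs_minfree / (2 * 100);
	/* TIME -> SPACE when cs_nffree >= dsize * (minfree - 2) / 100. */
	int64_t to_space = fs_dsize * (fs_minfree - 2) / 100;

	printf("switch SPACE->TIME at or below %lld free frags (4%%)\n",
	    (long long)to_time);
	printf("switch TIME->SPACE at or above %lld free frags (6%%)\n",
	    (long long)to_space);
	return (0);
}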
390 { 454 */ 391 455 if (start_lvl == 0) { 392 if (doreallocblks == 0) 456 sbap = &ip->i_din1->di_db[0]; 393 return (ENOSPC); 457 soff = start_lbn; 394 if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1) 458 } else { 395 return (ffs_reallocblks_ufs1(ap)); 459 idp = &start_ap[start_lvl - 1]; 396 return (ffs_reallocblks_ufs2(ap)); 460 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 397 } 461 brelse(sbp); 398 462 return (ENOSPC); 399 static int 463 } 400 ffs_reallocblks_ufs1(ap) 464 sbap = (ufs1_daddr_t *)sbp->b_data; 401 struct vop_reallocblks_args /* { 465 soff = idp->in_off; 402 struct vnode *a_vp; 466 } 403 struct cluster_save *a_buflist; 467 /* 404 } */ *ap; 468 * Find the preferred location for the cluster. 405 { 469 */ 406 struct fs *fs; 470 pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 407 struct inode *ip; 471 /* 408 struct vnode *vp; 472 * If the block range spans two block maps, get the second map. 409 struct buf *sbp, *ebp; 473 */ 410 ufs1_daddr_t *bap, *sbap, *ebap = 0; 474 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 411 struct cluster_save *buflist; 475 ssize = len; 412 ufs_lbn_t start_lbn, end_lbn; 476 } else { 413 ufs1_daddr_t soff, newblk, blkno; 477 #ifdef DIAGNOSTIC 414 ufs2_daddr_t pref; 478 if (start_ap[start_lvl-1].in_lbn == idp->in_lbn) 415 struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; 479 panic("ffs_reallocblk: start == end"); 416 int i, len, start_lvl, end_lvl, ssize; 480 #endif 417 481 ssize = len - (idp->in_off + 1); 418 vp = ap->a_vp; 482 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 419 ip = VTOI(vp); 483 goto fail; 420 fs = ip->i_fs; 484 ebap = (ufs1_daddr_t *)ebp->b_data; 421 if (fs->fs_contigsumsize <= 0) 485 } 422 return (ENOSPC); 486 /* 423 buflist = ap->a_buflist; 487 * Search the block map looking for an allocation of the desired size. 424 len = buflist->bs_nchildren; 488 */ 425 start_lbn = buflist->bs_children[0]->b_lblkno; 489 if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, 426 end_lbn = start_lbn + len - 1; 490 len, ffs_clusteralloc)) == 0) 427 #ifdef DIAGNOSTIC 491 goto fail; 428 for (i = 0; i < len; i++) 492 /* 429 if (!ffs_checkblk(ip, 493 * We have found a new contiguous block. 430 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize 494 * )) 495 * First we have to replace the old block pointers with the new 431 panic("ffs_reallocblks: unallocated block 1"); 496 * block pointers in the inode and indirect blocks associated 432 for (i = 1; i < len; i++) 497 * with the file. 433 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 498 */ 434 panic("ffs_reallocblks: non-logical cluster"); 499 #ifdef DEBUG 435 blkno = buflist->bs_children[0]->b_blkno; 500 if (prtrealloc) 436 ssize = fsbtodb(fs, fs->fs_frag); 501 printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number, 437 for (i = 1; i < len - 1; i++) 502 (intmax_t)start_lbn, (intmax_t)end_lbn); 438 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 503 #endif 439 panic("ffs_reallocblks: non-physical cluster %d", i); 504 blkno = newblk; 440 #endif 505 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 441 /* 506 if (i == ssize) { 442 * If the latest allocation is in a new cylinder group, assume that 507 bap = ebap; 443 * the filesystem has decided to move and do not force it back to 508 soff = -i; 444 * the previous cylinder group. 
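Under DIAGNOSTIC, ffs_reallocblks_ufs1() insists that the buffers handed in by the clustering code are both logically consecutive (b_lblkno values start_lbn, start_lbn + 1, ...) and physically consecutive, one full block apart, where a block is fs_frag fragments converted to device blocks. A condensed userland version of those sanity checks; the buffer structure and the geometry numbers are fabricated for the example.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fakebuf {
	int64_t b_lblkno;	/* logical block within the file */
	int64_t b_blkno;	/* physical (device) block */
};

/*
 * Mirrors the DIAGNOSTIC loops: child i must sit at start_lbn + i
 * logically and at blkno + i * ssize physically, ssize being one
 * filesystem block expressed in device blocks (fsbtodb(fs, fs_frag)).
 */
static void
check_cluster(struct fakebuf *bs, int len, int64_t ssize)
{
	int64_t start_lbn = bs[0].b_lblkno;
	int64_t blkno = bs[0].b_blkno;
	int i;

	for (i = 1; i < len; i++)
		assert(bs[i].b_lblkno == start_lbn + i);	/* logical */
	for (i = 1; i < len - 1; i++)
		assert(bs[i].b_blkno == blkno + i * ssize);	/* physical */
}

int
main(void)
{
	/* Three 16 KB blocks, 32 device blocks apart (assumed geometry). */
	struct fakebuf bs[3] = {
		{ 100, 8000 }, { 101, 8032 }, { 102, 8064 },
	};

	check_cluster(bs, 3, 32);
	printf("cluster is logically and physically contiguous\n");
	return (0);
}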
509 } 445 */ 510 #ifdef DIAGNOSTIC 446 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 511 if (!ffs_checkblk(ip, 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 5 512 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize 575 buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); )) 576 #ifdef DIAGNOSTIC 513 panic("ffs_reallocblks: unallocated block 2"); 577 if (!ffs_checkblk(ip, 514 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 578 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize 515 panic("ffs_reallocblks: alloc mismatch"); )) 516 #endif 579 panic("ffs_reallocblks: unallocated block 3"); 517 #ifdef DEBUG 580 #endif 518 if (prtrealloc) 581 #ifdef DEBUG 519 printf(" %d,", *bap); 582 if (prtrealloc) 520 #endif 583 printf(" %d,", blkno); 521 if (DOINGSOFTDEP(vp)) { 584 #endif 522 if (sbap == &ip->i_din1->di_db[0] && i < ssize) 585 } 523 softdep_setup_allocdirect(ip, start_lbn + i, 586 #ifdef DEBUG 524 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 587 if (prtrealloc) { 525 buflist->bs_children[i]); 588 prtrealloc--; 526 else 589 printf("\n"); 527 softdep_setup_allocindir_page(ip, start_lbn + 590 } i, 591 #endif 528 i < ssize ? sbp : ebp, soff + i, blkno, 592 return (0); 529 *bap, buflist->bs_children[i]); 593 530 } 594 fail: 531 *bap++ = blkno; 595 if (ssize < len) 532 } 596 brelse(ebp); 533 /* 597 if (sbap != &ip->i_din1->di_db[0]) 534 * Next we must write out the modified inode and indirect blocks. 598 brelse(sbp); 535 * For strict correctness, the writes should be synchronous since 599 return (ENOSPC); 536 * the old block values may have been written to disk. In practise 600 } 537 * they are almost never written, but if we are concerned about 601 538 * strict correctness, the ‘doasyncfree’ flag should be set to zero. 602 static int 539 * 603 ffs_reallocblks_ufs2(ap) 540 * The test on ‘doasyncfree’ should be changed to test a flag 604 struct vop_reallocblks_args /* { 541 * that shows whether the associated buffers and inodes have 605 struct vnode *a_vp; 542 * been written. The flag should be set when the cluster is 606 struct cluster_save *a_buflist; 543 * started and cleared whenever the buffer or inode is flushed. 607 } */ *ap; 544 * We can then check below to see if it is set, and do the 608 { 545 * synchronous write only when it has been cleared. 609 struct fs *fs; 546 */ 610 struct inode *ip; 547 if (sbap != &ip->i_din1->di_db[0]) { 611 struct vnode *vp; 548 if (doasyncfree) 612 struct buf *sbp, *ebp; 549 bdwrite(sbp); 613 ufs2_daddr_t *bap, *sbap, *ebap = 0; 550 else 614 struct cluster_save *buflist; 551 bwrite(sbp); 615 ufs_lbn_t start_lbn, end_lbn; 552 } else { 616 ufs2_daddr_t soff, newblk, blkno, pref; 553 ip->i_flag |= IN_CHANGE | IN_UPDATE; 617 struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; 554 if (!doasyncfree) 618 int i, len, start_lvl, end_lvl, ssize; 555 UFS_UPDATE(vp, 1); 619 556 } 620 vp = ap->a_vp; 557 if (ssize < len) { 621 ip = VTOI(vp); 558 if (doasyncfree) 622 fs = ip->i_fs; 559 bdwrite(ebp); 623 if (fs->fs_contigsumsize <= 0) 560 else 624 return (ENOSPC); 561 bwrite(ebp); 625 buflist = ap->a_buflist; 562 } 626 len = buflist->bs_nchildren; 563 /* 627 start_lbn = buflist->bs_children[0]->b_lblkno; 564 * Last, free the old blocks and assign the new blocks to the buffers. 
628 end_lbn = start_lbn + len - 1; 565 */ 629 #ifdef DIAGNOSTIC 566 #ifdef DEBUG 630 for (i = 0; i < len; i++) 567 if (prtrealloc) 631 if (!ffs_checkblk(ip, 568 printf("\n\tnew:"); 632 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize 569 #endif )) 570 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 633 panic("ffs_reallocblks: unallocated block 1"); 571 if (!DOINGSOFTDEP(vp)) 634 for (i = 1; i < len; i++) 572 ffs_blkfree(fs, ip->i_devvp, 635 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 573 dbtofsb(fs, buflist->bs_children[i]->b_blkno), 636 panic("ffs_reallocblks: non-logical cluster"); 574 fs->fs_bsize, ip->i_number); 637 blkno = buflist->bs_children[0]->b_blkno; 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 6 638 ssize = fsbtodb(fs, fs->fs_frag); 703 printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number, 639 for (i = 1; i < len - 1; i++) 704 (intmax_t)start_lbn, (intmax_t)end_lbn); 640 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 705 #endif 641 panic("ffs_reallocblks: non-physical cluster %d", i); 706 blkno = newblk; 642 #endif 707 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 643 /* 708 if (i == ssize) { 644 * If the latest allocation is in a new cylinder group, assume that 709 bap = ebap; 645 * the filesystem has decided to move and do not force it back to 710 soff = -i; 646 * the previous cylinder group. 711 } 647 */ 712 #ifdef DIAGNOSTIC 648 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 713 if (!ffs_checkblk(ip, 649 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 714 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize 650 return (ENOSPC); )) 651 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 715 panic("ffs_reallocblks: unallocated block 2"); 652 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 716 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 653 return (ENOSPC); 717 panic("ffs_reallocblks: alloc mismatch"); 654 /* 718 #endif 655 * Get the starting offset and block map for the first block. 719 #ifdef DEBUG 656 */ 720 if (prtrealloc) 657 if (start_lvl == 0) { 721 printf(" %jd,", (intmax_t)*bap); 658 sbap = &ip->i_din2->di_db[0]; 722 #endif 659 soff = start_lbn; 723 if (DOINGSOFTDEP(vp)) { 660 } else { 724 if (sbap == &ip->i_din2->di_db[0] && i < ssize) 661 idp = &start_ap[start_lvl - 1]; 725 softdep_setup_allocdirect(ip, start_lbn + i, 662 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 726 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 663 brelse(sbp); 727 buflist->bs_children[i]); 664 return (ENOSPC); 728 else 665 } 729 softdep_setup_allocindir_page(ip, start_lbn + 666 sbap = (ufs2_daddr_t *)sbp->b_data; i, 667 soff = idp->in_off; 730 i < ssize ? sbp : ebp, soff + i, blkno, 668 } 731 *bap, buflist->bs_children[i]); 669 /* 732 } 670 * Find the preferred location for the cluster. 733 *bap++ = blkno; 671 */ 734 } 672 pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); 735 /* 673 /* 736 * Next we must write out the modified inode and indirect blocks. 674 * If the block range spans two block maps, get the second map. 737 * For strict correctness, the writes should be synchronous since 675 */ 738 * the old block values may have been written to disk. In practise 676 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 739 * they are almost never written, but if we are concerned about 677 ssize = len; 740 * strict correctness, the ‘doasyncfree’ flag should be set to zero. 
678 } else { 741 * 679 #ifdef DIAGNOSTIC 742 * The test on ‘doasyncfree’ should be changed to test a flag 680 if (start_ap[start_lvl-1].in_lbn == idp->in_lbn) 743 * that shows whether the associated buffers and inodes have 681 panic("ffs_reallocblk: start == end"); 744 * been written. The flag should be set when the cluster is 682 #endif 745 * started and cleared whenever the buffer or inode is flushed. 683 ssize = len - (idp->in_off + 1); 746 * We can then check below to see if it is set, and do the 684 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 747 * synchronous write only when it has been cleared. 685 goto fail; 748 */ 686 ebap = (ufs2_daddr_t *)ebp->b_data; 749 if (sbap != &ip->i_din2->di_db[0]) { 687 } 750 if (doasyncfree) 688 /* 751 bdwrite(sbp); 689 * Search the block map looking for an allocation of the desired size. 752 else 690 */ 753 bwrite(sbp); 691 if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, 754 } else { 692 len, ffs_clusteralloc)) == 0) 755 ip->i_flag |= IN_CHANGE | IN_UPDATE; 693 goto fail; 756 if (!doasyncfree) 694 /* 757 UFS_UPDATE(vp, 1); 695 * We have found a new contiguous block. 758 } 696 * 759 if (ssize < len) { 697 * First we have to replace the old block pointers with the new 760 if (doasyncfree) 698 * block pointers in the inode and indirect blocks associated 761 bdwrite(ebp); 699 * with the file. 762 else 700 */ 763 bwrite(ebp); 701 #ifdef DEBUG 764 } 702 if (prtrealloc) 765 /* 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 7 766 * Last, free the old blocks and assign the new blocks to the buffers. 830 ino_t ino, ipref; 767 */ 831 int cg, error; 768 #ifdef DEBUG 832 769 if (prtrealloc) 833 *vpp = NULL; 770 printf("\n\tnew:"); 834 pip = VTOI(pvp); 771 #endif 835 fs = pip->i_fs; 772 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 836 if (fs->fs_cstotal.cs_nifree == 0) 773 if (!DOINGSOFTDEP(vp)) 837 goto noinodes; 774 ffs_blkfree(fs, ip->i_devvp, 838 775 dbtofsb(fs, buflist->bs_children[i]->b_blkno), 839 if ((mode & IFMT) == IFDIR) 776 fs->fs_bsize, ip->i_number); 840 ipref = ffs_dirpref(pip); 777 buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); 841 else 778 #ifdef DIAGNOSTIC 842 ipref = pip->i_number; 779 if (!ffs_checkblk(ip, 843 if (ipref >= fs->fs_ncg * fs->fs_ipg) 780 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize 844 ipref = 0; )) 845 cg = ino_to_cg(fs, ipref); 781 panic("ffs_reallocblks: unallocated block 3"); 846 /* 782 #endif 847 * Track number of dirs created one after another 783 #ifdef DEBUG 848 * in a same cg without intervening by files. 784 if (prtrealloc) 849 */ 785 printf(" %jd,", (intmax_t)blkno); 850 if ((mode & IFMT) == IFDIR) { 786 #endif 851 if (fs->fs_contigdirs[cg] < 255) 787 } 852 fs->fs_contigdirs[cg]++; 788 #ifdef DEBUG 853 } else { 789 if (prtrealloc) { 854 if (fs->fs_contigdirs[cg] > 0) 790 prtrealloc--; 855 fs->fs_contigdirs[cg]--; 791 printf("\n"); 856 } 792 } 857 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 793 #endif 858 (allocfcn_t *)ffs_nodealloccg); 794 return (0); 859 if (ino == 0) 795 860 goto noinodes; 796 fail: 861 error = VFS_VGET(pvp->v_mount, ino, LK_EXCLUSIVE, vpp); 797 if (ssize < len) 862 if (error) { 798 brelse(ebp); 863 UFS_VFREE(pvp, ino, mode); 799 if (sbap != &ip->i_din2->di_db[0]) 864 return (error); 800 brelse(sbp); 865 } 801 return (ENOSPC); 866 ip = VTOI(*vpp); 802 } 867 if (ip->i_mode) { 803 868 printf("mode = 0%o, inum = %lu, fs = %s\n", 804 /* 869 ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt); 805 * Allocate an inode in the filesystem. 
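ffs_valloc() chooses its inode preference differently for directories and plain files: a new directory goes through ffs_dirpref(), anything else prefers the cylinder group of its parent directory's inode. It then maintains fs_contigdirs[cg], a saturating per-group count of directories created back to back without an intervening file, which ffs_dirpref() consults later. A small model of just that bookkeeping; the array size and the call sequence are invented for illustration.

#include <stdio.h>
#include <stdint.h>

#define NCG	16			/* assumed number of cylinder groups */

static uint8_t contigdirs[NCG];		/* like fs_contigdirs[], saturates at 255 */

/* Called at inode-allocation time, as in ffs_valloc(). */
static void
track_creation(int cg, int is_dir)
{
	if (is_dir) {
		if (contigdirs[cg] < 255)	/* count dirs created in a row */
			contigdirs[cg]++;
	} else {
		if (contigdirs[cg] > 0)		/* a file eats into the streak */
			contigdirs[cg]--;
	}
}

int
main(void)
{
	int cg = 3;

	track_creation(cg, 1);	/* mkdir */
	track_creation(cg, 1);	/* mkdir */
	track_creation(cg, 0);	/* create a regular file */
	printf("cg %d contigdirs = %u\n", cg, contigdirs[cg]);	/* prints 1 */
	return (0);
}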
870 panic("ffs_valloc: dup alloc"); 806 * 871 } 807 * If allocating a directory, use ffs_dirpref to select the inode. 872 if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX * 808 * If allocating in a directory, the following hierarchy is followed: / 809 * 1) allocate the preferred inode. 873 printf("free inode %s/%lu had %ld blocks\n", 810 * 2) allocate an inode in the same cylinder group. 874 fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks)); 811 * 3) quadradically rehash into other cylinder groups, until an 875 DIP(ip, i_blocks) = 0; 812 * available inode is located. 876 } 813 * If no inode preference is given the following heirarchy is used 877 ip->i_flags = 0; 814 * to allocate an inode: 878 DIP(ip, i_flags) = 0; 815 * 1) allocate an inode in cylinder group 0. 879 /* 816 * 2) quadradically rehash into other cylinder groups, until an 880 * Set up a new generation number for this inode. 817 * available inode is located. 881 */ 818 */ 882 if (ip->i_gen == 0 || ++ip->i_gen == 0) 819 int 883 ip->i_gen = arc4random() / 2 + 1; 820 ffs_valloc(pvp, mode, cred, vpp) 884 DIP(ip, i_gen) = ip->i_gen; 821 struct vnode *pvp; 885 if (fs->fs_magic == FS_UFS2_MAGIC) { 822 int mode; 886 vfs_timestamp(&ts); 823 struct ucred *cred; 887 ip->i_din2->di_birthtime = ts.tv_sec; 824 struct vnode **vpp; 888 ip->i_din2->di_birthnsec = ts.tv_nsec; 825 { 889 } 826 struct inode *pip; 890 return (0); 827 struct fs *fs; 891 noinodes: 828 struct inode *ip; 892 ffs_fserr(fs, pip->i_number, "out of inodes"); 829 struct timespec ts; 893 uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt) 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 8 ; 958 if (minifree < 1) 894 return (ENOSPC); 959 minifree = 1; 895 } 960 minbfree = avgbfree - avgbfree / 4; 896 961 if (minbfree < 1) 897 /* 962 minbfree = 1; 898 * Find a cylinder group to place a directory. 963 cgsize = fs->fs_fsize * fs->fs_fpg; 899 * 964 dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; 900 * The policy implemented by this algorithm is to allocate a 965 curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 901 * directory inode in the same cylinder group as its parent 0; 902 * directory, but also to reserve space for its files inodes 966 if (dirsize < curdirsize) 903 * and data. Restrict the number of directories which may be 967 dirsize = curdirsize; 904 * allocated one after another in the same cylinder group 968 maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); 905 * without intervening allocation of files. 969 if (fs->fs_avgfpdir > 0) 906 * 970 maxcontigdirs = min(maxcontigdirs, 907 * If we allocate a first level directory then force allocation 971 fs->fs_ipg / fs->fs_avgfpdir); 908 * in another cylinder group. 972 if (maxcontigdirs == 0) 909 */ 973 maxcontigdirs = 1; 910 static ino_t 974 911 ffs_dirpref(pip) 975 /* 912 struct inode *pip; 976 * Limit number of dirs in one cg and reserve space for 913 { 977 * regular files, but only if we have no deficit in 914 struct fs *fs; 978 * inodes or space. 
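Once the new inode is back from VFS_VGET(), ffs_valloc() gives it a generation number that is never zero: the previous generation is incremented, and if it was zero or wraps to zero, a random value from arc4random() / 2 + 1 (always nonzero, and within positive 32-bit range) is substituted. A sketch of that rule as a pure function, using a userland stand-in for arc4random().

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Userland stand-in for the kernel's arc4random(). */
static uint32_t
fake_arc4random(void)
{
	return ((uint32_t)rand());
}

/* Same shape as the ffs_valloc() logic: the result is never zero. */
static uint32_t
next_generation(uint32_t gen)
{
	if (gen == 0 || ++gen == 0)
		gen = fake_arc4random() / 2 + 1;	/* random, never zero */
	return (gen);
}

int
main(void)
{
	printf("from 0        -> %u\n", next_generation(0));
	printf("from 41       -> %u\n", next_generation(41));	/* 42 */
	printf("from UINT_MAX -> %u\n", next_generation(0xffffffffu));
	return (0);
}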
915 int cg, prefcg, dirsize, cgsize; 979 */ 916 int avgifree, avgbfree, avgndir, curdirsize; 980 prefcg = ino_to_cg(fs, pip->i_number); 917 int minifree, minbfree, maxndir; 981 for (cg = prefcg; cg < fs->fs_ncg; cg++) 918 int mincg, minndir; 982 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 919 int maxcontigdirs; 983 fs->fs_cs(fs, cg).cs_nifree >= minifree && 920 984 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 921 fs = pip->i_fs; 985 if (fs->fs_contigdirs[cg] < maxcontigdirs) 922 986 return ((ino_t)(fs->fs_ipg * cg)); 923 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 987 } 924 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 988 for (cg = 0; cg < prefcg; cg++) 925 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 989 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 926 990 fs->fs_cs(fs, cg).cs_nifree >= minifree && 927 /* 991 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 928 * Force allocation in another cg if creating a first level dir. 992 if (fs->fs_contigdirs[cg] < maxcontigdirs) 929 */ 993 return ((ino_t)(fs->fs_ipg * cg)); 930 ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref"); 994 } 931 if (ITOV(pip)->v_vflag & VV_ROOT) { 995 /* 932 prefcg = arc4random() % fs->fs_ncg; 996 * This is a backstop when we have deficit in space. 933 mincg = prefcg; 997 */ 934 minndir = fs->fs_ipg; 998 for (cg = prefcg; cg < fs->fs_ncg; cg++) 935 for (cg = prefcg; cg < fs->fs_ncg; cg++) 999 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 936 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 1000 return ((ino_t)(fs->fs_ipg * cg)); 937 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 1001 for (cg = 0; cg < prefcg; cg++) 938 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1002 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 939 mincg = cg; 1003 break; 940 minndir = fs->fs_cs(fs, cg).cs_ndir; 1004 return ((ino_t)(fs->fs_ipg * cg)); 941 } 1005 } 942 for (cg = 0; cg < prefcg; cg++) 1006 943 if (fs->fs_cs(fs, cg).cs_ndir < minndir && 1007 /* 944 fs->fs_cs(fs, cg).cs_nifree >= avgifree && 1008 * Select the desired position for the next block in a file. The file is 945 fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1009 * logically divided into sections. The first section is composed of the 946 mincg = cg; 1010 * direct blocks. Each additional section contains fs_maxbpg blocks. 947 minndir = fs->fs_cs(fs, cg).cs_ndir; 1011 * 948 } 1012 * If no blocks have been allocated in the first section, the policy is to 949 return ((ino_t)(fs->fs_ipg * mincg)); 1013 * request a block in the same cylinder group as the inode that describes 950 } 1014 * the file. If no blocks have been allocated in any other section, the 951 1015 * policy is to place the section in a cylinder group with a greater than 952 /* 1016 * average number of free blocks. An appropriate cylinder group is found 953 * Count various limits which used for 1017 * by using a rotor that sweeps the cylinder groups. When a new group of 954 * optimal allocation of a directory inode. 1018 * blocks is needed, the sweep begins in the cylinder group following the 955 */ 1019 * cylinder group from which the previous allocation was made. The sweep 956 maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); 1020 * continues until a cylinder group with greater than the average number 957 minifree = avgifree - avgifree / 4; 1021 * of free blocks is found. 
If the allocation is for the first block in an 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 9 1022 * indirect block, the information on the previous allocation is unavailable; 1087 struct fs *fs; 1023 * here a best guess is made based upon the logical block number being 1088 int cg; 1024 * allocated. 1089 int avgbfree, startcg; 1025 * 1090 1026 * If a section is already partially allocated, the policy is to 1091 fs = ip->i_fs; 1027 * contiguously allocate fs_maxcontig blocks. The end of one of these 1092 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 1028 * contiguous blocks and the beginning of the next is laid out 1093 if (lbn < NDADDR + NINDIR(fs)) { 1029 * contiguously if possible. 1094 cg = ino_to_cg(fs, ip->i_number); 1030 */ 1095 return (fs->fs_fpg * cg + fs->fs_frag); 1031 ufs2_daddr_t 1096 } 1032 ffs_blkpref_ufs1(ip, lbn, indx, bap) 1097 /* 1033 struct inode *ip; 1098 * Find a cylinder with greater than average number of 1034 ufs_lbn_t lbn; 1099 * unused data blocks. 1035 int indx; 1100 */ 1036 ufs1_daddr_t *bap; 1101 if (indx == 0 || bap[indx - 1] == 0) 1037 { 1102 startcg = 1038 struct fs *fs; 1103 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; 1039 int cg; 1104 else 1040 int avgbfree, startcg; 1105 startcg = dtog(fs, bap[indx - 1]) + 1; 1041 1106 startcg %= fs->fs_ncg; 1042 fs = ip->i_fs; 1107 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1043 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 1108 for (cg = startcg; cg < fs->fs_ncg; cg++) 1044 if (lbn < NDADDR + NINDIR(fs)) { 1109 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1045 cg = ino_to_cg(fs, ip->i_number); 1110 fs->fs_cgrotor = cg; 1046 return (fs->fs_fpg * cg + fs->fs_frag); 1111 return (fs->fs_fpg * cg + fs->fs_frag); 1047 } 1112 } 1048 /* 1113 for (cg = 0; cg <= startcg; cg++) 1049 * Find a cylinder with greater than average number of 1114 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1050 * unused data blocks. 1115 fs->fs_cgrotor = cg; 1051 */ 1116 return (fs->fs_fpg * cg + fs->fs_frag); 1052 if (indx == 0 || bap[indx - 1] == 0) 1117 } 1053 startcg = 1118 return (0); 1054 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; 1119 } 1055 else 1120 /* 1056 startcg = dtog(fs, bap[indx - 1]) + 1; 1121 * We just always try to lay things out contiguously. 1057 startcg %= fs->fs_ncg; 1122 */ 1058 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1123 return (bap[indx - 1] + fs->fs_frag); 1059 for (cg = startcg; cg < fs->fs_ncg; cg++) 1124 } 1060 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1125 1061 fs->fs_cgrotor = cg; 1126 /* 1062 return (fs->fs_fpg * cg + fs->fs_frag); 1127 * Implement the cylinder overflow algorithm. 1063 } 1128 * 1064 for (cg = 0; cg <= startcg; cg++) 1129 * The policy implemented by this algorithm is: 1065 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1130 * 1) allocate the block in its requested cylinder group. 1066 fs->fs_cgrotor = cg; 1131 * 2) quadradically rehash on the cylinder group number. 1067 return (fs->fs_fpg * cg + fs->fs_frag); 1132 * 3) brute force search for a free block. 1068 } 1133 */ 1069 return (0); 1134 /*VARARGS5*/ 1070 } 1135 static ufs2_daddr_t 1071 /* 1136 ffs_hashalloc(ip, cg, pref, size, allocator) 1072 * We just always try to lay things out contiguously. 
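Both ffs_blkpref variants reduce to two cases: inside a section, simply prefer the block right after the previous allocation (bap[indx - 1] + fs_frag); at a section boundary, sweep the cylinder groups from a rotor position, wrapping around, for one with at least the average number of free blocks. The sketch below models only the sweep and returns a group index; the kernel routine additionally updates fs_cgrotor, converts the group to a fragment address, and falls back to "no preference" (0) if nothing qualifies. All numbers are invented.

#include <stdio.h>
#include <stdint.h>

#define NCG	8	/* assumed number of cylinder groups */

/*
 * Pick a cylinder group with at least the average number of free blocks,
 * starting the sweep at 'startcg' and wrapping, as ffs_blkpref_ufs1() does.
 */
static int
sweep_for_cg(const int64_t nbfree[NCG], int startcg)
{
	int64_t total = 0, avg;
	int cg;

	for (cg = 0; cg < NCG; cg++)
		total += nbfree[cg];
	avg = total / NCG;			/* cs_nbfree / ncg */

	for (cg = startcg; cg < NCG; cg++)	/* rotor position onward */
		if (nbfree[cg] >= avg)
			return (cg);
	for (cg = 0; cg <= startcg; cg++)	/* then wrap around */
		if (nbfree[cg] >= avg)
			return (cg);
	return (-1);				/* kernel returns preference 0 */
}

int
main(void)
{
	int64_t nbfree[NCG] = { 10, 5, 80, 200, 3, 150, 60, 40 };

	printf("starting at cg 4, prefer cg %d\n", sweep_for_cg(nbfree, 4));
	return (0);
}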
1137 struct inode *ip; 1073 */ 1138 int cg; 1074 return (bap[indx - 1] + fs->fs_frag); 1139 ufs2_daddr_t pref; 1075 } 1140 int size; /* size for data blocks, mode for inodes */ 1076 1141 allocfcn_t *allocator; 1077 /* 1142 { 1078 * Same as above, but for UFS2 1143 struct fs *fs; 1079 */ 1144 ufs2_daddr_t result; 1080 ufs2_daddr_t 1145 int i, icg = cg; 1081 ffs_blkpref_ufs2(ip, lbn, indx, bap) 1146 1082 struct inode *ip; 1147 #ifdef DIAGNOSTIC 1083 ufs_lbn_t lbn; 1148 if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 1084 int indx; 1149 panic("ffs_hashalloc: allocation on suspended filesystem"); 1085 ufs2_daddr_t *bap; 1150 #endif 1086 { 1151 fs = ip->i_fs; 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 10 1152 /* 1217 (int)fs->fs_cgsize, NOCRED, &bp); 1153 * 1: preferred cylinder group 1218 if (error) { 1154 */ 1219 brelse(bp); 1155 result = (*allocator)(ip, cg, pref, size); 1220 return (0); 1156 if (result) 1221 } 1157 return (result); 1222 cgp = (struct cg *)bp->b_data; 1158 /* 1223 if (!cg_chkmagic(cgp)) { 1159 * 2: quadratic rehash 1224 brelse(bp); 1160 */ 1225 return (0); 1161 for (i = 1; i < fs->fs_ncg; i *= 2) { 1226 } 1162 cg += i; 1227 bp->b_xflags |= BX_BKGRDWRITE; 1163 if (cg >= fs->fs_ncg) 1228 cgp->cg_old_time = cgp->cg_time = time_second; 1164 cg -= fs->fs_ncg; 1229 bno = dtogd(fs, bprev); 1165 result = (*allocator)(ip, cg, 0, size); 1230 blksfree = cg_blksfree(cgp); 1166 if (result) 1231 for (i = numfrags(fs, osize); i < frags; i++) 1167 return (result); 1232 if (isclr(blksfree, bno + i)) { 1168 } 1233 brelse(bp); 1169 /* 1234 return (0); 1170 * 3: brute force search 1235 } 1171 * Note that we start at i == 2, since 0 was checked initially, 1236 /* 1172 * and 1 is always checked in the quadratic rehash. 1237 * the current fragment can be extended 1173 */ 1238 * deduct the count on fragment being extended into 1174 cg = (icg + 2) % fs->fs_ncg; 1239 * increase the count on the remaining fragment (if any) 1175 for (i = 2; i < fs->fs_ncg; i++) { 1240 * allocate the extended piece 1176 result = (*allocator)(ip, cg, 0, size); 1241 */ 1177 if (result) 1242 for (i = frags; i < fs->fs_frag - bbase; i++) 1178 return (result); 1243 if (isclr(blksfree, bno + i)) 1179 cg++; 1244 break; 1180 if (cg == fs->fs_ncg) 1245 cgp->cg_frsum[i - numfrags(fs, osize)]--; 1181 cg = 0; 1246 if (i != frags) 1182 } 1247 cgp->cg_frsum[i - frags]++; 1183 return (0); 1248 for (i = numfrags(fs, osize); i < frags; i++) { 1184 } 1249 clrbit(blksfree, bno + i); 1185 1250 cgp->cg_cs.cs_nffree--; 1186 /* 1251 fs->fs_cstotal.cs_nffree--; 1187 * Determine whether a fragment can be extended. 1252 fs->fs_cs(fs, cg).cs_nffree--; 1188 * 1253 } 1189 * Check to see if the necessary fragments are available, and 1254 fs->fs_fmod = 1; 1190 * if they are, allocate them. 1255 if (DOINGSOFTDEP(ITOV(ip))) 1191 */ 1256 softdep_setup_blkmapdep(bp, fs, bprev); 1192 static ufs2_daddr_t 1257 if (fs->fs_active != 0) 1193 ffs_fragextend(ip, cg, bprev, osize, nsize) 1258 atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 1194 struct inode *ip; 1259 bdwrite(bp); 1195 int cg; 1260 return (bprev); 1196 ufs2_daddr_t bprev; 1261 } 1197 int osize, nsize; 1262 1198 { 1263 /* 1199 struct fs *fs; 1264 * Determine whether a block can be allocated. 1200 struct cg *cgp; 1265 * 1201 struct buf *bp; 1266 * Check to see if a block of the appropriate size is available, 1202 long bno; 1267 * and if it is, allocate it. 
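ffs_hashalloc() probes cylinder groups in three phases: the preferred group itself, then a quadratic rehash (offsets 1, 2, 4, 8, ... from the last position, modulo the group count), and finally a brute-force pass starting two groups past the original preference. The sketch below just prints that probe order for an assumed group count, without performing any allocation.

#include <stdio.h>

static void
probe_order(int ncg, int icg)
{
	int cg = icg, i;

	printf("phase 1 (preferred): %d\n", cg);

	printf("phase 2 (quadratic rehash):");
	for (i = 1; i < ncg; i *= 2) {
		cg += i;
		if (cg >= ncg)
			cg -= ncg;
		printf(" %d", cg);
	}
	printf("\n");

	printf("phase 3 (brute force):");
	cg = (icg + 2) % ncg;			/* 0 and 1 offsets already tried */
	for (i = 2; i < ncg; i++) {
		printf(" %d", cg);
		cg++;
		if (cg == ncg)
			cg = 0;
	}
	printf("\n");
}

int
main(void)
{
	probe_order(10, 3);	/* assumed 10 groups, preference group 3 */
	return (0);
}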
1203 int frags, bbase; 1268 */ 1204 int i, error; 1269 static ufs2_daddr_t 1205 u_int8_t *blksfree; 1270 ffs_alloccg(ip, cg, bpref, size) 1206 1271 struct inode *ip; 1207 fs = ip->i_fs; 1272 int cg; 1208 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 1273 ufs2_daddr_t bpref; 1209 return (0); 1274 int size; 1210 frags = numfrags(fs, nsize); 1275 { 1211 bbase = fragnum(fs, bprev); 1276 struct fs *fs; 1212 if (bbase > fragnum(fs, (bprev + frags - 1))) { 1277 struct cg *cgp; 1213 /* cannot extend across a block boundary */ 1278 struct buf *bp; 1214 return (0); 1279 ufs1_daddr_t bno; 1215 } 1280 ufs2_daddr_t blkno; 1216 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 1281 int i, allocsiz, error, frags; 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 11 1282 u_int8_t *blksfree; 1345 return (0); 1283 1346 } 1284 fs = ip->i_fs; 1347 for (i = 0; i < frags; i++) 1285 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1348 clrbit(blksfree, bno + i); 1286 return (0); 1349 cgp->cg_cs.cs_nffree -= frags; 1287 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 1350 fs->fs_cstotal.cs_nffree -= frags; 1288 (int)fs->fs_cgsize, NOCRED, &bp); 1351 fs->fs_cs(fs, cg).cs_nffree -= frags; 1289 if (error) { 1352 fs->fs_fmod = 1; 1290 brelse(bp); 1353 cgp->cg_frsum[allocsiz]--; 1291 return (0); 1354 if (frags != allocsiz) 1292 } 1355 cgp->cg_frsum[allocsiz - frags]++; 1293 cgp = (struct cg *)bp->b_data; 1356 blkno = cg * fs->fs_fpg + bno; 1294 if (!cg_chkmagic(cgp) || 1357 if (DOINGSOFTDEP(ITOV(ip))) 1295 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { 1358 softdep_setup_blkmapdep(bp, fs, blkno); 1296 brelse(bp); 1359 if (fs->fs_active != 0) 1297 return (0); 1360 atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 1298 } 1361 bdwrite(bp); 1299 bp->b_xflags |= BX_BKGRDWRITE; 1362 return (blkno); 1300 cgp->cg_old_time = cgp->cg_time = time_second; 1363 } 1301 if (size == fs->fs_bsize) { 1364 1302 blkno = ffs_alloccgblk(ip, bp, bpref); 1365 /* 1303 if (fs->fs_active != 0) 1366 * Allocate a block in a cylinder group. 1304 atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg) 1367 * ); 1368 * This algorithm implements the following policy: 1305 bdwrite(bp); 1369 * 1) allocate the requested block. 1306 return (blkno); 1370 * 2) allocate a rotationally optimal block in the same cylinder. 1307 } 1371 * 3) allocate the next available block on the block rotor for the 1308 /* 1372 * specified cylinder group. 1309 * check to see if any fragments are already available 1373 * Note that this routine only allocates fs_bsize blocks; these 1310 * allocsiz is the size which will be allocated, hacking 1374 * blocks may be fragmented by the routine that allocates them. 
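For a partial-block request, ffs_alloccg() consults cg_frsum[], the per-group histogram of free fragment runs by length: it takes the smallest run that fits and credits the leftover piece back as a shorter run; only if no run fits does it break up a whole block (crediting the fs_frag - frags leftover fragments to the histogram). A toy version of that histogram bookkeeping, with an assumed fragments-per-block value and made-up contents.

#include <stdio.h>

#define FS_FRAG	8	/* assumed fragments per block */

/* frsum[k] counts free runs of exactly k fragments, as cg_frsum[] does. */
static int frsum[FS_FRAG];

/* Take 'frags' fragments out of the histogram, mimicking ffs_alloccg(). */
static int
alloc_frags(int frags)
{
	int allocsiz;

	for (allocsiz = frags; allocsiz < FS_FRAG; allocsiz++)
		if (frsum[allocsiz] != 0)
			break;
	if (allocsiz == FS_FRAG)
		return (-1);	/* would have to break a full block instead */
	frsum[allocsiz]--;			/* the run we used is gone */
	if (frags != allocsiz)
		frsum[allocsiz - frags]++;	/* leftover piece of the run */
	return (allocsiz);
}

int
main(void)
{
	int used;

	frsum[3] = 1;			/* one free 3-fragment run */
	frsum[6] = 2;			/* two free 6-fragment runs */
	used = alloc_frags(4);		/* need 4 frags -> takes a 6-run */
	printf("took a run of %d, leftover runs of 2: %d\n", used, frsum[2]);
	return (0);
}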
1311 * it down to a smaller size if necessary 1375 */ 1312 */ 1376 static ufs2_daddr_t 1313 blksfree = cg_blksfree(cgp); 1377 ffs_alloccgblk(ip, bp, bpref) 1314 frags = numfrags(fs, size); 1378 struct inode *ip; 1315 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1379 struct buf *bp; 1316 if (cgp->cg_frsum[allocsiz] != 0) 1380 ufs2_daddr_t bpref; 1317 break; 1381 { 1318 if (allocsiz == fs->fs_frag) { 1382 struct fs *fs; 1319 /* 1383 struct cg *cgp; 1320 * no fragments were available, so a block will be 1384 ufs1_daddr_t bno; 1321 * allocated, and hacked up 1385 ufs2_daddr_t blkno; 1322 */ 1386 u_int8_t *blksfree; 1323 if (cgp->cg_cs.cs_nbfree == 0) { 1387 1324 brelse(bp); 1388 fs = ip->i_fs; 1325 return (0); 1389 cgp = (struct cg *)bp->b_data; 1326 } 1390 blksfree = cg_blksfree(cgp); 1327 blkno = ffs_alloccgblk(ip, bp, bpref); 1391 if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) { 1328 bno = dtogd(fs, blkno); 1392 bpref = cgp->cg_rotor; 1329 for (i = frags; i < fs->fs_frag; i++) 1393 } else { 1330 setbit(blksfree, bno + i); 1394 bpref = blknum(fs, bpref); 1331 i = fs->fs_frag - frags; 1395 bno = dtogd(fs, bpref); 1332 cgp->cg_cs.cs_nffree += i; 1396 /* 1333 fs->fs_cstotal.cs_nffree += i; 1397 * if the requested block is available, use it 1334 fs->fs_cs(fs, cg).cs_nffree += i; 1398 */ 1335 fs->fs_fmod = 1; 1399 if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) 1336 cgp->cg_frsum[i]++; 1400 goto gotit; 1337 if (fs->fs_active != 0) 1401 } 1338 atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg) 1402 /* ); 1403 * Take the next available block in this cylinder group. 1339 bdwrite(bp); 1404 */ 1340 return (blkno); 1405 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1341 } 1406 if (bno < 0) 1342 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1407 return (0); 1343 if (bno < 0) { 1408 cgp->cg_rotor = bno; 1344 brelse(bp); 1409 gotit: 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 12 1410 blkno = fragstoblks(fs, bno); 1475 break; 1411 ffs_clrblock(fs, blksfree, (long)blkno); 1476 fs->fs_maxcluster[cg] = i; 1412 ffs_clusteracct(fs, cgp, blkno, -1); 1477 goto fail; 1413 cgp->cg_cs.cs_nbfree--; 1478 } 1414 fs->fs_cstotal.cs_nbfree--; 1479 /* 1415 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1480 * Search the cluster map to find a big enough cluster. 1416 fs->fs_fmod = 1; 1481 * We take the first one that we find, even if it is larger 1417 blkno = cgp->cg_cgx * fs->fs_fpg + bno; 1482 * than we need as we prefer to get one close to the previous 1418 if (DOINGSOFTDEP(ITOV(ip))) 1483 * block allocation. We do not search before the current 1419 softdep_setup_blkmapdep(bp, fs, blkno); 1484 * preference point as we do not want to allocate a block 1420 return (blkno); 1485 * that is allocated before the previous one (as we will 1421 } 1486 * then have to wait for another pass of the elevator 1422 1487 * algorithm before it will be read). We prefer to fail and 1423 /* 1488 * be recalled to try an allocation in the next cylinder group. 1424 * Determine whether a cluster can be allocated. 1489 */ 1425 * 1490 if (dtog(fs, bpref) != cg) 1426 * We do not currently check for optimal rotational layout if there 1491 bpref = 0; 1427 * are multiple choices in the same cylinder group. Instead we just 1492 else 1428 * take the first one that we find following bpref. 
1493 bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref))); 1429 */ 1494 mapp = &cg_clustersfree(cgp)[bpref / NBBY]; 1430 static ufs2_daddr_t 1495 map = *mapp++; 1431 ffs_clusteralloc(ip, cg, bpref, len) 1496 bit = 1 << (bpref % NBBY); 1432 struct inode *ip; 1497 for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { 1433 int cg; 1498 if ((map & bit) == 0) { 1434 ufs2_daddr_t bpref; 1499 run = 0; 1435 int len; 1500 } else { 1436 { 1501 run++; 1437 struct fs *fs; 1502 if (run == len) 1438 struct cg *cgp; 1503 break; 1439 struct buf *bp; 1504 } 1440 int i, run, bit, map, got; 1505 if ((got & (NBBY - 1)) != (NBBY - 1)) { 1441 ufs2_daddr_t bno; 1506 bit <<= 1; 1442 u_char *mapp; 1507 } else { 1443 int32_t *lp; 1508 map = *mapp++; 1444 u_int8_t *blksfree; 1509 bit = 1; 1445 1510 } 1446 fs = ip->i_fs; 1511 } 1447 if (fs->fs_maxcluster[cg] < len) 1512 if (got >= cgp->cg_nclusterblks) 1448 return (0); 1513 goto fail; 1449 if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, 1514 /* 1450 NOCRED, &bp)) 1515 * Allocate the cluster that we have found. 1451 goto fail; 1516 */ 1452 cgp = (struct cg *)bp->b_data; 1517 blksfree = cg_blksfree(cgp); 1453 if (!cg_chkmagic(cgp)) 1518 for (i = 1; i <= len; i++) 1454 goto fail; 1519 if (!ffs_isblock(fs, blksfree, got - run + i)) 1455 bp->b_xflags |= BX_BKGRDWRITE; 1520 panic("ffs_clusteralloc: map mismatch"); 1456 /* 1521 bno = cg * fs->fs_fpg + blkstofrags(fs, got - run + 1); 1457 * Check to see if a cluster of the needed size (or bigger) is 1522 if (dtog(fs, bno) != cg) 1458 * available in this cylinder group. 1523 panic("ffs_clusteralloc: allocated out of group"); 1459 */ 1524 len = blkstofrags(fs, len); 1460 lp = &cg_clustersum(cgp)[len]; 1525 for (i = 0; i < len; i += fs->fs_frag) 1461 for (i = len; i <= fs->fs_contigsumsize; i++) 1526 if (ffs_alloccgblk(ip, bp, bno + i) != bno + i) 1462 if (*lp++ > 0) 1527 panic("ffs_clusteralloc: lost block"); 1463 break; 1528 if (fs->fs_active != 0) 1464 if (i > fs->fs_contigsumsize) { 1529 atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 1465 /* 1530 bdwrite(bp); 1466 * This is the first time looking for a cluster in this 1531 return (bno); 1467 * cylinder group. Update the cluster summary information 1532 1468 * to reflect the true maximum sized cluster so that 1533 fail: 1469 * future cluster allocation requests can avoid reading 1534 brelse(bp); 1470 * the cylinder group map only to find no clusters. 1535 return (0); 1471 */ 1536 } 1472 lp = &cg_clustersum(cgp)[len - 1]; 1537 1473 for (i = len - 1; i > 0; i--) 1538 /* 1474 if (*lp-- > 0) 1539 * Determine whether an inode can be allocated. 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 13 1540 * 1605 } 1541 * Check to see if an inode is available, and if it is, 1606 printf("fs = %s\n", fs->fs_fsmnt); 1542 * allocate it using the following policy: 1607 panic("ffs_nodealloccg: block not in map"); 1543 * 1) allocate the requested inode. 1608 /* NOTREACHED */ 1544 * 2) allocate the next available inode after the requested 1609 gotit: 1545 * inode in the specified cylinder group. 
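The heart of ffs_clusteralloc() is a linear scan of the per-group cluster bitmap: it walks bit by bit from the preference point, counting the current run of set bits and stopping at the first run at least as long as the request (as the comment notes, it deliberately does not search before the preference). A standalone version of that bit-walking loop is below; it recomputes the byte and mask per bit rather than carrying map/bit incrementally as the kernel loop does, and the bitmap contents are invented.

#include <stdio.h>
#include <limits.h>

#define NBBY	CHAR_BIT	/* bits per byte, as in the kernel */

/*
 * Find the first run of 'len' consecutive set bits in 'map', starting the
 * scan at bit 'start'.  Returns the index of the first bit of the run, or
 * -1 if there is none.
 */
static int
find_run(const unsigned char *map, int nbits, int start, int len)
{
	int run = 0, got;

	for (got = start; got < nbits; got++) {
		if ((map[got / NBBY] & (1 << (got % NBBY))) == 0) {
			run = 0;		/* run broken, start over */
		} else {
			run++;
			if (run == len)
				return (got - run + 1);
		}
	}
	return (-1);
}

int
main(void)
{
	/* byte 0 = 0x9c: bits 2,3,4,7 set; byte 1 = 0x0f: bits 8..11 set. */
	unsigned char map[2] = { 0x9c, 0x0f };

	printf("run of 3 starts at bit %d\n", find_run(map, 16, 0, 3));
	printf("run of 4 starts at bit %d\n", find_run(map, 16, 0, 4));
	return (0);
}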
1610 if (DOINGSOFTDEP(ITOV(ip))) 1546 */ 1611 softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref); 1547 static ufs2_daddr_t 1612 setbit(inosused, ipref); 1548 ffs_nodealloccg(ip, cg, ipref, mode) 1613 cgp->cg_cs.cs_nifree--; 1549 struct inode *ip; 1614 fs->fs_cstotal.cs_nifree--; 1550 int cg; 1615 fs->fs_cs(fs, cg).cs_nifree--; 1551 ufs2_daddr_t ipref; 1616 fs->fs_fmod = 1; 1552 int mode; 1617 if ((mode & IFMT) == IFDIR) { 1553 { 1618 cgp->cg_cs.cs_ndir++; 1554 struct fs *fs; 1619 fs->fs_cstotal.cs_ndir++; 1555 struct cg *cgp; 1620 fs->fs_cs(fs, cg).cs_ndir++; 1556 struct buf *bp, *ibp; 1621 } 1557 u_int8_t *inosused; 1622 /* 1558 struct ufs2_dinode *dp2; 1623 * Check to see if we need to initialize more inodes. 1559 int error, start, len, loc, map, i; 1624 */ 1560 1625 if (fs->fs_magic == FS_UFS2_MAGIC && 1561 fs = ip->i_fs; 1626 ipref + INOPB(fs) > cgp->cg_initediblk && 1562 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1627 cgp->cg_initediblk < cgp->cg_niblk) { 1563 return (0); 1628 ibp = getblk(ip->i_devvp, fsbtodb(fs, 1564 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 1629 ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)), 1565 (int)fs->fs_cgsize, NOCRED, &bp); 1630 (int)fs->fs_bsize, 0, 0, 0); 1566 if (error) { 1631 bzero(ibp->b_data, (int)fs->fs_bsize); 1567 brelse(bp); 1632 dp2 = (struct ufs2_dinode *)(ibp->b_data); 1568 return (0); 1633 for (i = 0; i < INOPB(fs); i++) { 1569 } 1634 dp2->di_gen = arc4random() / 2 + 1; 1570 cgp = (struct cg *)bp->b_data; 1635 dp2++; 1571 if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { 1636 } 1572 brelse(bp); 1637 bawrite(ibp); 1573 return (0); 1638 cgp->cg_initediblk += INOPB(fs); 1574 } 1639 } 1575 bp->b_xflags |= BX_BKGRDWRITE; 1640 if (fs->fs_active != 0) 1576 cgp->cg_old_time = cgp->cg_time = time_second; 1641 atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 1577 inosused = cg_inosused(cgp); 1642 bdwrite(bp); 1578 if (ipref) { 1643 return (cg * fs->fs_ipg + ipref); 1579 ipref %= fs->fs_ipg; 1644 } 1580 if (isclr(inosused, ipref)) 1645 1581 goto gotit; 1646 /* 1582 } 1647 * check if a block is free 1583 start = cgp->cg_irotor / NBBY; 1648 */ 1584 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 1649 static int 1585 loc = skpc(0xff, len, &inosused[start]); 1650 ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h) 1586 if (loc == 0) { 1651 { 1587 len = start + 1; 1652 1588 start = 0; 1653 switch ((int)fs->fs_frag) { 1589 loc = skpc(0xff, len, &inosused[0]); 1654 case 8: 1590 if (loc == 0) { 1655 return (cp[h] == 0); 1591 printf("cg = %d, irotor = %ld, fs = %s\n", 1656 case 4: 1592 cg, (long)cgp->cg_irotor, fs->fs_fsmnt); 1657 return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); 1593 panic("ffs_nodealloccg: map corrupted"); 1658 case 2: 1594 /* NOTREACHED */ 1659 return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); 1595 } 1660 case 1: 1596 } 1661 return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); 1597 i = start + len - loc; 1662 default: 1598 map = inosused[i]; 1663 panic("ffs_isfreeblock"); 1599 ipref = i * NBBY; 1664 } 1600 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { 1665 return (0); 1601 if ((map & i) == 0) { 1666 } 1602 cgp->cg_irotor = ipref; 1667 1603 goto gotit; 1668 /* 1604 } 1669 * Free a block or fragment. 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 14 1670 * 1734 cgp->cg_old_time = cgp->cg_time = time_second; 1671 * The specified block or fragment is placed back in the 1735 cgbno = dtogd(fs, bno); 1672 * free map. 
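ffs_isfreeblock() shows how the fragment bitmap is indexed when a block is 8, 4, 2, or 1 fragments: a whole block occupies a byte, a nibble, two bits, or one bit, so the test is a mask-and-compare whose shift depends on fs_frag. In cg_blksfree() a set bit marks a free fragment (ffs_alloccg() clears bits as it allocates, ffs_blkfree() sets them again), so this helper returns non-zero only when every bit covering the block is clear; ffs_blkfree() relies on that to catch freeing a block that is already partly free. The same switch, lifted into a standalone function with a made-up bitmap:

#include <stdio.h>

/*
 * Same shifts and masks as ffs_isfreeblock(): with 'frag' fragments per
 * block, block h covers a byte, a nibble, two bits or one bit of the
 * fragment bitmap 'cp'.  Returns non-zero when all of those bits are
 * clear, i.e. no fragment of the block is marked free.
 */
static int
block_bits_clear(int frag, const unsigned char *cp, unsigned h)
{
	switch (frag) {
	case 8:
		return (cp[h] == 0);
	case 4:
		return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
	case 2:
		return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
	case 1:
		return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
	default:
		return (0);	/* the kernel panics here instead */
	}
}

int
main(void)
{
	/* 4 frags/block: byte 0 holds blocks 0 and 1; block 1 = high nibble. */
	unsigned char map[1] = { 0x30 };	/* two free frags in block 1 */

	printf("block 0 has no free fragments: %d\n",
	    block_bits_clear(4, map, 0));	/* 1 */
	printf("block 1 has no free fragments: %d\n",
	    block_bits_clear(4, map, 1));	/* 0 */
	return (0);
}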
If a fragment is deallocated, a possible 1736 blksfree = cg_blksfree(cgp); 1673 * block reassembly is checked. 1737 if (size == fs->fs_bsize) { 1674 */ 1738 fragno = fragstoblks(fs, cgbno); 1675 void 1739 if (!ffs_isfreeblock(fs, blksfree, fragno)) { 1676 ffs_blkfree(fs, devvp, bno, size, inum) 1740 if (devvp->v_type != VCHR) { 1677 struct fs *fs; 1741 /* devvp is a snapshot */ 1678 struct vnode *devvp; 1742 brelse(bp); 1679 ufs2_daddr_t bno; 1743 return; 1680 long size; 1744 } 1681 ino_t inum; 1745 printf("dev = %s, block = %jd, fs = %s\n", 1682 { 1746 devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); 1683 struct cg *cgp; 1747 panic("ffs_blkfree: freeing free block"); 1684 struct buf *bp; 1748 } 1685 ufs1_daddr_t fragno, cgbno; 1749 ffs_setblock(fs, blksfree, fragno); 1686 ufs2_daddr_t cgblkno; 1750 ffs_clusteracct(fs, cgp, fragno, 1); 1687 int i, cg, blk, frags, bbase; 1751 cgp->cg_cs.cs_nbfree++; 1688 u_int8_t *blksfree; 1752 fs->fs_cstotal.cs_nbfree++; 1689 dev_t dev; 1753 fs->fs_cs(fs, cg).cs_nbfree++; 1690 1754 } else { 1691 cg = dtog(fs, bno); 1755 bbase = cgbno - fragnum(fs, cgbno); 1692 if (devvp->v_type != VCHR) { 1756 /* 1693 /* devvp is a snapshot */ 1757 * decrement the counts associated with the old frags 1694 dev = VTOI(devvp)->i_devvp->v_rdev; 1758 */ 1695 cgblkno = fragstoblks(fs, cgtod(fs, cg)); 1759 blk = blkmap(fs, blksfree, bbase); 1696 } else { 1760 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 1697 /* devvp is a normal disk device */ 1761 /* 1698 dev = devvp->v_rdev; 1762 * deallocate the fragment 1699 cgblkno = fsbtodb(fs, cgtod(fs, cg)); 1763 */ 1700 ASSERT_VOP_LOCKED(devvp, "ffs_blkfree"); 1764 frags = numfrags(fs, size); 1701 if ((devvp->v_vflag & VV_COPYONWRITE) && 1765 for (i = 0; i < frags; i++) { 1702 ffs_snapblkfree(fs, devvp, bno, size, inum)) 1766 if (isset(blksfree, cgbno + i)) { 1703 return; 1767 printf("dev = %s, block = %jd, fs = %s\n", 1704 VOP_FREEBLKS(devvp, fsbtodb(fs, bno), size); 1768 devtoname(dev), (intmax_t)(bno + i), 1705 } 1769 fs->fs_fsmnt); 1706 #ifdef DIAGNOSTIC 1770 panic("ffs_blkfree: freeing free frag"); 1707 if (dev->si_mountpoint && 1771 } 1708 (dev->si_mountpoint->mnt_kern_flag & MNTK_SUSPENDED)) 1772 setbit(blksfree, cgbno + i); 1709 panic("ffs_blkfree: deallocation on suspended filesystem"); 1773 } 1710 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || 1774 cgp->cg_cs.cs_nffree += i; 1711 fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { 1775 fs->fs_cstotal.cs_nffree += i; 1712 printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n" 1776 fs->fs_cs(fs, cg).cs_nffree += i; , 1777 /* 1713 devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, 1778 * add back in counts associated with the new frags 1714 size, fs->fs_fsmnt); 1779 */ 1715 panic("ffs_blkfree: bad size"); 1780 blk = blkmap(fs, blksfree, bbase); 1716 } 1781 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 1717 #endif 1782 /* 1718 if ((u_int)bno >= fs->fs_size) { 1783 * if a complete block has been reassembled, account for it 1719 printf("bad block %jd, ino %lu\n", (intmax_t)bno, 1784 */ 1720 (u_long)inum); 1785 fragno = fragstoblks(fs, bbase); 1721 ffs_fserr(fs, inum, "bad block"); 1786 if (ffs_isblock(fs, blksfree, fragno)) { 1722 return; 1787 cgp->cg_cs.cs_nffree -= fs->fs_frag; 1723 } 1788 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 1724 if (bread(devvp, cgblkno, (int)fs->fs_cgsize, NOCRED, &bp)) { 1789 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 1725 brelse(bp); 1790 ffs_clusteracct(fs, cgp, fragno, 1); 1726 return; 1791 cgp->cg_cs.cs_nbfree++; 1727 } 1792 
fs->fs_cstotal.cs_nbfree++; 1728 cgp = (struct cg *)bp->b_data; 1793 fs->fs_cs(fs, cg).cs_nbfree++; 1729 if (!cg_chkmagic(cgp)) { 1794 } 1730 brelse(bp); 1795 } 1731 return; 1796 fs->fs_fmod = 1; 1732 } 1797 if (fs->fs_active != 0) 1733 bp->b_xflags |= BX_BKGRDWRITE; 1798 atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 15 1799 bdwrite(bp); 1864 return (0); 1800 } 1865 } 1801 1866 return (ffs_freefile(VTOI(pvp)->i_fs, VTOI(pvp)->i_devvp, ino, mode)); 1802 #ifdef DIAGNOSTIC 1867 } 1803 /* 1868 1804 * Verify allocation of a block or fragment. Returns true if block or 1869 /* 1805 * fragment is allocated, false if it is free. 1870 * Do the actual free operation. 1806 */ 1871 * The specified inode is placed back in the free map. 1807 static int 1872 */ 1808 ffs_checkblk(ip, bno, size) 1873 int 1809 struct inode *ip; 1874 ffs_freefile(fs, devvp, ino, mode) 1810 ufs2_daddr_t bno; 1875 struct fs *fs; 1811 long size; 1876 struct vnode *devvp; 1812 { 1877 ino_t ino; 1813 struct fs *fs; 1878 int mode; 1814 struct cg *cgp; 1879 { 1815 struct buf *bp; 1880 struct cg *cgp; 1816 ufs1_daddr_t cgbno; 1881 struct buf *bp; 1817 int i, error, frags, free; 1882 ufs2_daddr_t cgbno; 1818 u_int8_t *blksfree; 1883 int error, cg; 1819 1884 u_int8_t *inosused; 1820 fs = ip->i_fs; 1885 dev_t dev; 1821 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 1886 1822 printf("bsize = %ld, size = %ld, fs = %s\n", 1887 cg = ino_to_cg(fs, ino); 1823 (long)fs->fs_bsize, size, fs->fs_fsmnt); 1888 if (devvp->v_type != VCHR) { 1824 panic("ffs_checkblk: bad size"); 1889 /* devvp is a snapshot */ 1825 } 1890 dev = VTOI(devvp)->i_devvp->v_rdev; 1826 if ((u_int)bno >= fs->fs_size) 1891 cgbno = fragstoblks(fs, cgtod(fs, cg)); 1827 panic("ffs_checkblk: bad block %jd", (intmax_t)bno); 1892 } else { 1828 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))), 1893 /* devvp is a normal disk device */ 1829 (int)fs->fs_cgsize, NOCRED, &bp); 1894 dev = devvp->v_rdev; 1830 if (error) 1895 cgbno = fsbtodb(fs, cgtod(fs, cg)); 1831 panic("ffs_checkblk: cg bread failed"); 1896 } 1832 cgp = (struct cg *)bp->b_data; 1897 if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) 1833 if (!cg_chkmagic(cgp)) 1898 panic("ffs_freefile: range: dev = %s, ino = %lu, fs = %s", 1834 panic("ffs_checkblk: cg magic mismatch"); 1899 devtoname(dev), (u_long)ino, fs->fs_fsmnt); 1835 bp->b_xflags |= BX_BKGRDWRITE; 1900 if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) { 1836 blksfree = cg_blksfree(cgp); 1901 brelse(bp); 1837 cgbno = dtogd(fs, bno); 1902 return (error); 1838 if (size == fs->fs_bsize) { 1903 } 1839 free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); 1904 cgp = (struct cg *)bp->b_data; 1840 } else { 1905 if (!cg_chkmagic(cgp)) { 1841 frags = numfrags(fs, size); 1906 brelse(bp); 1842 for (free = 0, i = 0; i < frags; i++) 1907 return (0); 1843 if (isset(blksfree, cgbno + i)) 1908 } 1844 free++; 1909 bp->b_xflags |= BX_BKGRDWRITE; 1845 if (free != 0 && free != frags) 1910 cgp->cg_old_time = cgp->cg_time = time_second; 1846 panic("ffs_checkblk: partially free fragment"); 1911 inosused = cg_inosused(cgp); 1847 } 1912 ino %= fs->fs_ipg; 1848 brelse(bp); 1913 if (isclr(inosused, ino)) { 1849 return (!free); 1914 printf("dev = %s, ino = %lu, fs = %s\n", devtoname(dev), 1850 } 1915 (u_long)ino + cg * fs->fs_ipg, fs->fs_fsmnt); 1851 #endif /* DIAGNOSTIC */ 1916 if (fs->fs_ronly == 0) 1852 1917 panic("ffs_freefile: freeing free inode"); 1853 /* 1918 } 1854 * Free an inode. 
1919 clrbit(inosused, ino); 1855 */ 1920 if (ino < cgp->cg_irotor) 1856 int 1921 cgp->cg_irotor = ino; 1857 ffs_vfree(pvp, ino, mode) 1922 cgp->cg_cs.cs_nifree++; 1858 struct vnode *pvp; 1923 fs->fs_cstotal.cs_nifree++; 1859 ino_t ino; 1924 fs->fs_cs(fs, cg).cs_nifree++; 1860 int mode; 1925 if ((mode & IFMT) == IFDIR) { 1861 { 1926 cgp->cg_cs.cs_ndir--; 1862 if (DOINGSOFTDEP(pvp)) { 1927 fs->fs_cstotal.cs_ndir--; 1863 softdep_freefile(pvp, ino, mode); 1928 fs->fs_cs(fs, cg).cs_ndir--; 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 16 1929 } 1994 u_int8_t *blksfree; 1930 fs->fs_fmod = 1; 1995 1931 if (fs->fs_active != 0) 1996 /* 1932 atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 1997 * find the fragment by searching through the free block 1933 bdwrite(bp); 1998 * map for an appropriate bit pattern 1934 return (0); 1999 */ 1935 } 2000 if (bpref) 1936 2001 start = dtogd(fs, bpref) / NBBY; 1937 /* 2002 else 1938 * Check to see if a file is free. 2003 start = cgp->cg_frotor / NBBY; 1939 */ 2004 blksfree = cg_blksfree(cgp); 1940 int 2005 len = howmany(fs->fs_fpg, NBBY) - start; 1941 ffs_checkfreefile(fs, devvp, ino) 2006 loc = scanc((u_int)len, (u_char *)&blksfree[start], 1942 struct fs *fs; 2007 (u_char *)fragtbl[fs->fs_frag], 1943 struct vnode *devvp; 2008 (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 1944 ino_t ino; 2009 if (loc == 0) { 1945 { 2010 len = start + 1; 1946 struct cg *cgp; 2011 start = 0; 1947 struct buf *bp; 2012 loc = scanc((u_int)len, (u_char *)&blksfree[0], 1948 ufs2_daddr_t cgbno; 2013 (u_char *)fragtbl[fs->fs_frag], 1949 int ret, cg; 2014 (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 1950 u_int8_t *inosused; 2015 if (loc == 0) { 1951 2016 printf("start = %d, len = %d, fs = %s\n", 1952 cg = ino_to_cg(fs, ino); 2017 start, len, fs->fs_fsmnt); 1953 if (devvp->v_type != VCHR) { 2018 panic("ffs_alloccg: map corrupted"); 1954 /* devvp is a snapshot */ 2019 /* NOTREACHED */ 1955 cgbno = fragstoblks(fs, cgtod(fs, cg)); 2020 } 1956 } else { 2021 } 1957 /* devvp is a normal disk device */ 2022 bno = (start + len - loc) * NBBY; 1958 cgbno = fsbtodb(fs, cgtod(fs, cg)); 2023 cgp->cg_frotor = bno; 1959 } 2024 /* 1960 if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) 2025 * found the byte in the map 1961 return (1); 2026 * sift through the bits to find the selected frag 1962 if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp)) { 2027 */ 1963 brelse(bp); 2028 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 1964 return (1); 2029 blk = blkmap(fs, blksfree, bno); 1965 } 2030 blk <<= 1; 1966 cgp = (struct cg *)bp->b_data; 2031 field = around[allocsiz]; 1967 if (!cg_chkmagic(cgp)) { 2032 subfield = inside[allocsiz]; 1968 brelse(bp); 2033 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { 1969 return (1); 2034 if ((blk & field) == subfield) 1970 } 2035 return (bno + pos); 1971 inosused = cg_inosused(cgp); 2036 field <<= 1; 1972 ino %= fs->fs_ipg; 2037 subfield <<= 1; 1973 ret = isclr(inosused, ino); 2038 } 1974 brelse(bp); 2039 } 1975 return (ret); 2040 printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); 1976 } 2041 panic("ffs_alloccg: block not in map"); 1977 2042 return (-1); 1978 /* 2043 } 1979 * Find a block of the specified size in the specified cylinder group. 2044 1980 * 2045 /* 1981 * It is a panic if a request is made to find a block if none are 2046 * Update the cluster map because of an allocation or free. 1982 * available. 2047 * 1983 */ 2048 * Cnt == 1 means free; cnt == -1 means allocating. 
1984 static ufs1_daddr_t 2049 */ 1985 ffs_mapsearch(fs, cgp, bpref, allocsiz) 2050 void 1986 struct fs *fs; 2051 ffs_clusteracct(fs, cgp, blkno, cnt) 1987 struct cg *cgp; 2052 struct fs *fs; 1988 ufs2_daddr_t bpref; 2053 struct cg *cgp; 1989 int allocsiz; 2054 ufs1_daddr_t blkno; 1990 { 2055 int cnt; 1991 ufs1_daddr_t bno; 2056 { 1992 int start, len, loc, i; 2057 int32_t *sump; 1993 int blk, field, subfield, pos; 2058 int32_t *lp; 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 17 2059 u_char *freemapp, *mapp; 2124 sump[back] -= cnt; 2060 int i, start, end, forw, back, map, bit; 2125 if (forw > 0) 2061 2126 sump[forw] -= cnt; 2062 if (fs->fs_contigsumsize <= 0) 2127 /* 2063 return; 2128 * Update cluster summary information. 2064 freemapp = cg_clustersfree(cgp); 2129 */ 2065 sump = cg_clustersum(cgp); 2130 lp = &sump[fs->fs_contigsumsize]; 2066 /* 2131 for (i = fs->fs_contigsumsize; i > 0; i--) 2067 * Allocate or clear the actual block. 2132 if (*lp-- > 0) 2068 */ 2133 break; 2069 if (cnt > 0) 2134 fs->fs_maxcluster[cgp->cg_cgx] = i; 2070 setbit(freemapp, blkno); 2135 } 2071 else 2136 2072 clrbit(freemapp, blkno); 2137 /* 2073 /* 2138 * Fserr prints the name of a filesystem with an error diagnostic. 2074 * Find the size of the cluster going forward. 2139 * 2075 */ 2140 * The form of the error message is: 2076 start = blkno + 1; 2141 * fs: error message 2077 end = start + fs->fs_contigsumsize; 2142 */ 2078 if (end >= cgp->cg_nclusterblks) 2143 static void 2079 end = cgp->cg_nclusterblks; 2144 ffs_fserr(fs, inum, cp) 2080 mapp = &freemapp[start / NBBY]; 2145 struct fs *fs; 2081 map = *mapp++; 2146 ino_t inum; 2082 bit = 1 << (start % NBBY); 2147 char *cp; 2083 for (i = start; i < end; i++) { 2148 { 2084 if ((map & bit) == 0) 2149 struct thread *td = curthread; /* XXX */ 2085 break; 2150 struct proc *p = td->td_proc; 2086 if ((i & (NBBY - 1)) != (NBBY - 1)) { 2151 2087 bit <<= 1; 2152 log(LOG_ERR, "pid %d (%s), uid %d inumber %d on %s: %s\n", 2088 } else { 2153 p->p_pid, p->p_comm, td->td_ucred->cr_uid, inum, fs->fs_fsmnt, cp) 2089 map = *mapp++; ; 2090 bit = 1; 2154 } 2091 } 2155 2092 } 2156 /* 2093 forw = i - start; 2157 * This function provides the capability for the fsck program to 2094 /* 2158 * update an active filesystem. Six operations are provided: 2095 * Find the size of the cluster going backward. 2159 * 2096 */ 2160 * adjrefcnt(inode, amt) - adjusts the reference count on the 2097 start = blkno - 1; 2161 * specified inode by the specified amount. Under normal 2098 end = start - fs->fs_contigsumsize; 2162 * operation the count should always go down. Decrementing 2099 if (end < 0) 2163 * the count to zero will cause the inode to be freed. 2100 end = -1; 2164 * adjblkcnt(inode, amt) - adjust the number of blocks used to 2101 mapp = &freemapp[start / NBBY]; 2165 * by the specifed amount. 2102 map = *mapp--; 2166 * freedirs(inode, count) - directory inodes [inode..inode + count - 1] 2103 bit = 1 << (start % NBBY); 2167 * are marked as free. Inodes should never have to be marked 2104 for (i = start; i > end; i--) { 2168 * as in use. 2105 if ((map & bit) == 0) 2169 * freefiles(inode, count) - file inodes [inode..inode + count - 1] 2106 break; 2170 * are marked as free. Inodes should never have to be marked 2107 if ((i & (NBBY - 1)) != 0) { 2171 * as in use. 2108 bit >>= 1; 2172 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] 2109 } else { 2173 * are marked as free. Blocks should never have to be marked 2110 map = *mapp--; 2174 * as in use. 
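ffs_clusteracct(), shown on this page, keeps the per-cylinder-group cluster summary current: after setting or clearing the bit for the block in the cluster free map it measures how far the free run now extends forward and backward, credits the combined cluster size, and debits the old forward and backward runs so they are not counted twice. The function below is a simplified user-space rendering of that bookkeeping; the kernel bounds the scans with fs_contigsumsize up front and also recomputes fs_maxcluster, both of which are omitted here, and the names are invented for the sketch.

#include <stdint.h>
#include <stdio.h>

#define NBBY		8
#define MAXCONTIG	4	/* stands in for fs_contigsumsize */

#define isset(a, i)	((a)[(i)/NBBY] & (1 << ((i) % NBBY)))
#define setbit(a, i)	((a)[(i)/NBBY] |= (1 << ((i) % NBBY)))
#define clrbit(a, i)	((a)[(i)/NBBY] &= ~(1 << ((i) % NBBY)))

static void
clusteracct(unsigned char *freemap, int nblks, int32_t *sump,
    int blkno, int cnt)		/* cnt == 1 free, cnt == -1 allocate */
{
	int i, forw, back;

	if (cnt > 0)
		setbit(freemap, blkno);
	else
		clrbit(freemap, blkno);
	for (i = blkno + 1; i < nblks && isset(freemap, i); i++)
		;			/* size of the run going forward */
	forw = i - (blkno + 1);
	if (forw > MAXCONTIG)
		forw = MAXCONTIG;
	for (i = blkno - 1; i >= 0 && isset(freemap, i); i--)
		;			/* size of the run going backward */
	back = (blkno - 1) - i;
	if (back > MAXCONTIG)
		back = MAXCONTIG;
	i = back + forw + 1;		/* size of the combined cluster */
	if (i > MAXCONTIG)
		i = MAXCONTIG;
	sump[i] += cnt;			/* credit the new cluster size */
	if (back > 0)
		sump[back] -= cnt;	/* the old runs are no longer separate */
	if (forw > 0)
		sump[forw] -= cnt;
}

int
main(void)
{
	unsigned char map[2] = { 0x76, 0x00 };		/* blocks 1,2 and 4,5,6 free */
	int32_t sump[MAXCONTIG + 1] = { 0, 0, 1, 1, 0 };	/* one 2-run, one 3-run */

	clusteracct(map, 16, sump, 3, 1);	/* freeing block 3 merges the runs */
	printf("size-%d clusters now: %d\n", MAXCONTIG, (int)sump[MAXCONTIG]);
	return (0);
}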
2111 bit = 1 << (NBBY - 1); 2175 * setflags(flags, set/clear) - the fs_flags field has the specified 2112 } 2176 * flags set (second parameter +1) or cleared (second parameter -1). 2113 } 2177 */ 2114 back = start - i; 2178 2115 /* 2179 static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); 2116 * Account for old cluster and the possibly new forward and 2180 2117 * back clusters. 2181 SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT, 2118 */ 2182 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); 2119 i = back + forw + 1; 2183 2120 if (i > fs->fs_contigsumsize) 2184 SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, 2121 i = fs->fs_contigsumsize; 2185 sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); 2122 sump[i] += cnt; 2186 2123 if (back > 0) 2187 SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR, 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 18 2188 sysctl_ffs_fsck, "Free Range of Directory Inodes"); 2252 fs->fs_flags &= ˜(long)cmd.value; 2189 2253 break; 2190 SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, 2254 2191 sysctl_ffs_fsck, "Free Range of File Inodes"); 2255 case FFS_ADJ_REFCNT: 2192 2256 #ifdef DEBUG 2193 SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, 2257 if (fsckcmds) { 2194 sysctl_ffs_fsck, "Free Range of Blocks"); 2258 printf("%s: adjust inode %jd count by %jd\n", 2195 2259 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 2196 SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, 2260 (intmax_t)cmd.size); 2197 sysctl_ffs_fsck, "Change Filesystem Flags"); 2261 } 2198 2262 #endif /* DEBUG */ 2199 #ifdef DEBUG 2263 if ((error = VFS_VGET(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp) 2200 static int fsckcmds = 0; )) 2201 SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); 2264 break; 2202 #endif /* DEBUG */ 2265 ip = VTOI(vp); 2203 2266 ip->i_nlink += cmd.size; 2204 static int 2267 DIP(ip, i_nlink) = ip->i_nlink; 2205 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) 2268 ip->i_effnlink += cmd.size; 2206 { 2269 ip->i_flag |= IN_CHANGE; 2207 struct fsck_cmd cmd; 2270 if (DOINGSOFTDEP(vp)) 2208 struct ufsmount *ump; 2271 softdep_change_linkcnt(ip); 2209 struct vnode *vp; 2272 vput(vp); 2210 struct inode *ip; 2273 break; 2211 struct mount *mp; 2274 2212 struct fs *fs; 2275 case FFS_ADJ_BLKCNT: 2213 ufs2_daddr_t blkno; 2276 #ifdef DEBUG 2214 long blkcnt, blksize; 2277 if (fsckcmds) { 2215 struct file *fp; 2278 printf("%s: adjust inode %jd block count by %jd\n", 2216 int filetype, error; 2279 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 2217 2280 (intmax_t)cmd.size); 2218 if (req->newlen > sizeof cmd) 2281 } 2219 return (EBADRPC); 2282 #endif /* DEBUG */ 2220 if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) 2283 if ((error = VFS_VGET(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp) 2221 return (error); )) 2222 if (cmd.version != FFS_CMD_VERSION) 2284 break; 2223 return (ERPCMISMATCH); 2285 ip = VTOI(vp); 2224 if ((error = getvnode(curproc->p_fd, cmd.handle, &fp)) != 0) 2286 DIP(ip, i_blocks) += cmd.size; 2225 return (error); 2287 ip->i_flag |= IN_CHANGE; 2226 vn_start_write(fp->f_data, &mp, V_WAIT); 2288 vput(vp); 2227 if (mp == 0 || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) 2289 break; { 2290 2228 vn_finished_write(mp); 2291 case FFS_DIR_FREE: 2229 fdrop(fp, curthread); 2292 filetype = IFDIR; 2230 return (EINVAL); 2293 /* fall through */ 2231 } 2294 2232 if (mp->mnt_flag & MNT_RDONLY) { 2295 case FFS_FILE_FREE: 2233 vn_finished_write(mp); 2296 #ifdef DEBUG 2234 fdrop(fp, curthread); 2297 if 
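The long comment on this page documents the sysctl interface that lets a running fsck adjust an active filesystem, and the SYSCTL_PROC/SYSCTL_NODE declarations that follow hang the six handlers off vfs.ffs. The fragment below sketches what the userland side of one such request might look like. Treat it as an illustration rather than a recipe: the node name "vfs.ffs.adjrefcnt" is inferred from the SYSCTL_PROC declaration, the command layout is copied from the fsck_cmd definition later in fs.h (a real program would include <ufs/ffs/fs.h>), and the mount point and inode number are placeholders.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

struct fsck_cmd {
	int32_t	version;	/* FFS_CMD_VERSION */
	int32_t	handle;		/* descriptor open on the target filesystem */
	int64_t	value;		/* inode or block number to be affected */
	int64_t	size;		/* amount or range to be adjusted */
	int64_t	spare;
};
#define	FFS_CMD_VERSION	0x19790518

int
main(void)
{
	struct fsck_cmd cmd = { 0 };
	int fd;

	/* Any descriptor on the mounted filesystem identifies it. */
	if ((fd = open("/mnt", O_RDONLY)) < 0)
		return (1);
	cmd.version = FFS_CMD_VERSION;
	cmd.handle = fd;
	cmd.value = 1234;	/* inode number (hypothetical) */
	cmd.size = -1;		/* drop its reference count by one */

	/* The node name follows the SYSCTL_PROC declaration above. */
	if (sysctlbyname("vfs.ffs.adjrefcnt", NULL, NULL, &cmd,
	    sizeof(cmd)) < 0) {
		perror("vfs.ffs.adjrefcnt");
		return (1);
	}
	return (0);
}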
(fsckcmds) { 2235 return (EROFS); 2298 if (cmd.size == 1) 2236 } 2299 printf("%s: free %s inode %d\n", 2237 ump = VFSTOUFS(mp); 2300 mp->mnt_stat.f_mntonname, 2238 fs = ump->um_fs; 2301 filetype == IFDIR ? "directory" : "file", 2239 filetype = IFREG; 2302 (ino_t)cmd.value); 2240 2303 else 2241 switch (oidp->oid_number) { 2304 printf("%s: free %s inodes %d-%d\n", 2242 2305 mp->mnt_stat.f_mntonname, 2243 case FFS_SET_FLAGS: 2306 filetype == IFDIR ? "directory" : "file", 2244 #ifdef DEBUG 2307 (ino_t)cmd.value, 2245 if (fsckcmds) 2308 (ino_t)(cmd.value + cmd.size - 1)); 2246 printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, 2309 } 2247 cmd.size > 0 ? "set" : "clear"); 2310 #endif /* DEBUG */ 2248 #endif /* DEBUG */ 2311 while (cmd.size > 0) { 2249 if (cmd.size > 0) 2312 if ((error = ffs_freefile(fs, ump->um_devvp, cmd.value 2250 fs->fs_flags |= (long)cmd.value; , 2251 else 2313 filetype))) 10/30/03 23:25:06 sys/ufs/ffs/ffs_alloc.c 19 2314 break; 2315 cmd.size -= 1; 2316 cmd.value += 1; 2317 } 2318 break; 2319 2320 case FFS_BLK_FREE: 2321 #ifdef DEBUG 2322 if (fsckcmds) { 2323 if (cmd.size == 1) 2324 printf("%s: free block %jd\n", 2325 mp->mnt_stat.f_mntonname, 2326 (intmax_t)cmd.value); 2327 else 2328 printf("%s: free blocks %jd-%jd\n", 2329 mp->mnt_stat.f_mntonname, 2330 (intmax_t)cmd.value, 2331 (intmax_t)cmd.value + cmd.size - 1); 2332 } 2333 #endif /* DEBUG */ 2334 blkno = cmd.value; 2335 blkcnt = cmd.size; 2336 blksize = fs->fs_frag - (blkno % fs->fs_frag); 2337 while (blkcnt > 0) { 2338 if (blksize > blkcnt) 2339 blksize = blkcnt; 2340 ffs_blkfree(fs, ump->um_devvp, blkno, 2341 blksize * fs->fs_fsize, ROOTINO); 2342 blkno += blksize; 2343 blkcnt -= blksize; 2344 blksize = fs->fs_frag; 2345 } 2346 break; 2347 2348 default: 2349 #ifdef DEBUG 2350 if (fsckcmds) { 2351 printf("Invalid request %d from fsck\n", 2352 oidp->oid_number); 2353 } 2354 #endif /* DEBUG */ 2355 error = EINVAL; 2356 break; 2357 2358 } 2359 fdrop(fp, curthread); 2360 vn_finished_write(mp); 2361 return (error); 2362 } 08/15/03 13:03:19 sys/ufs/ffs/ffs_balloc.c 1 1 /* 2 * Copyright (c) 2002 Networks Associates Technology, Inc. 422 /* 3 * All rights reserved. 423 * Balloc defines the structure of file system storage 4 * 424 * by allocating the physical blocks on a device given 5 * This software was developed for the FreeBSD Project by Marshall 425 * the inode and the logical block number in a file. 6 * Kirk McKusick and Network Associates Laboratories, the Security 426 * This is the allocation strategy for UFS2. Above is 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 427 * the allocation strategy for UFS1. 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 428 */ 9 * research program 429 int 10 * 430 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, 11 * Copyright (c) 1982, 1986, 1989, 1993 431 struct ucred *cred, int flags, struct buf **bpp) 12 * The Regents of the University of California. All rights reserved. 432 { 13 * 433 struct inode *ip; 14 * Redistribution and use in source and binary forms, with or without 434 struct ufs2_dinode *dp; 15 * modification, are permitted provided that the following conditions 435 ufs_lbn_t lbn, lastlbn; 16 * are met: 436 struct fs *fs; 17 * 1. Redistributions of source code must retain the above copyright 437 struct buf *bp, *nbp; 18 * notice, this list of conditions and the following disclaimer. 438 struct indir indirs[NIADDR + 2]; 19 * 2. 
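The FFS_BLK_FREE case above frees an arbitrary run of fragments by carving it into pieces that never straddle a block boundary: the first piece is whatever remains of the block containing blkno, and every later piece is at most a full block. The small program below reproduces only that carving arithmetic, printing the pieces instead of calling ffs_blkfree().

#include <stdio.h>

#define FS_FRAG 8	/* fragments per block, as in fs_frag */

static void
free_frag_range(long blkno, long blkcnt)
{
	long blksize = FS_FRAG - (blkno % FS_FRAG);	/* first, possibly partial piece */

	while (blkcnt > 0) {
		if (blksize > blkcnt)
			blksize = blkcnt;
		printf("free fragments %ld..%ld\n", blkno, blkno + blksize - 1);
		blkno += blksize;
		blkcnt -= blksize;
		blksize = FS_FRAG;			/* later pieces are whole blocks */
	}
}

int
main(void)
{
	free_frag_range(21, 20);	/* prints 21-23, 24-31, 32-39, 40-40 */
	return (0);
}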
Redistributions in binary form must reproduce the above copyright 439 ufs2_daddr_t nb, newb, *bap, pref; 20 * notice, this list of conditions and the following disclaimer in the 440 ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; 21 * documentation and/or other materials provided with the distribution. 441 int deallocated, osize, nsize, num, i, error; 22 * 3. All advertising materials mentioning features or use of this software 442 int unwindidx = -1; 23 * must display the following acknowledgement: 443 struct thread *td = curthread; /* XXX */ 24 * This product includes software developed by the University of 444 25 * California, Berkeley and its contributors. 445 ip = VTOI(vp); 26 * 4. Neither the name of the University nor the names of its contributors 446 dp = ip->i_din2; 27 * may be used to endorse or promote products derived from this software 447 fs = ip->i_fs; 28 * without specific prior written permission. 448 lbn = lblkno(fs, startoffset); 29 * 449 size = blkoff(fs, startoffset) + size; 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 450 if (size > fs->fs_bsize) 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 451 panic("ffs_balloc_ufs2: blk too big"); 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 452 *bpp = NULL; 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 453 if (lbn < 0) 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 454 return (EFBIG); 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 455 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 456 /* 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 457 * Check for allocating external data. 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 458 */ 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 459 if (flags & IO_EXT) { 40 * SUCH DAMAGE. 460 if (lbn >= NXADDR) 41 * 461 return (EFBIG); 42 * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 462 /* 43 */ 463 * If the next write will extend the data into a new block, 44 464 * and the data is currently composed of a fragment 45 #include 465 * this fragment has to be extended to be a full block. 
46 __FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_balloc.c,v 1.43 2003/08/15 20:03:19 ph 466 */ k Exp $"); 467 lastlbn = lblkno(fs, dp->di_extsize); 47 468 if (lastlbn < lbn) { 48 #include 469 nb = lastlbn; 49 #include 470 osize = sblksize(fs, dp->di_extsize, nb); 50 #include 471 if (osize < fs->fs_bsize && osize > 0) { 51 #include 472 error = ffs_realloccg(ip, -1 - nb, 52 #include 473 dp->di_extb[nb], 53 #include 474 ffs_blkpref_ufs2(ip, lastlbn, (int)nb, 54 #include 475 &dp->di_extb[0]), osize, 55 476 (int)fs->fs_bsize, cred, &bp); 56 #include 477 if (error) 57 #include 478 return (error); 58 #include 479 if (DOINGSOFTDEP(vp)) 59 #include 480 softdep_setup_allocext(ip, nb, 60 #include 481 dbtofsb(fs, bp->b_blkno), 61 482 dp->di_extb[nb], 62 #include 483 fs->fs_bsize, osize, bp); 63 #include 484 dp->di_extsize = smalllblktosize(fs, nb + 1); 485 dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno); 08/15/03 13:03:19 sys/ufs/ffs/ffs_balloc.c 2 486 bp->b_xflags |= BX_ALTDATA; 546 if (error) 487 ip->i_flag |= IN_CHANGE | IN_UPDATE; 547 return (error); 488 if (flags & IO_SYNC) 548 bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0); 489 bwrite(bp); 549 bp->b_blkno = fsbtodb(fs, newb); 490 else 550 bp->b_xflags |= BX_ALTDATA; 491 bawrite(bp); 551 if (flags & BA_CLRBUF) 492 } 552 vfs_bio_clrbuf(bp); 493 } 553 if (DOINGSOFTDEP(vp)) 494 /* 554 softdep_setup_allocext(ip, lbn, newb, 0, 495 * All blocks are direct blocks 555 nsize, 0, bp); 496 */ 556 } 497 if (flags & BA_METAONLY) 557 dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno); 498 panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); 558 ip->i_flag |= IN_CHANGE | IN_UPDATE; 499 nb = dp->di_extb[lbn]; 559 *bpp = bp; 500 if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) 560 return (0); { 561 } 501 error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp) 562 /* ; 563 * If the next write will extend the file into a new block, 502 if (error) { 564 * and the file is currently composed of a fragment 503 brelse(bp); 565 * this fragment has to be extended to be a full block. 504 return (error); 566 */ 505 } 567 lastlbn = lblkno(fs, ip->i_size); 506 bp->b_blkno = fsbtodb(fs, nb); 568 if (lastlbn < NDADDR && lastlbn < lbn) { 507 bp->b_xflags |= BX_ALTDATA; 569 nb = lastlbn; 508 *bpp = bp; 570 osize = blksize(fs, ip, nb); 509 return (0); 571 if (osize < fs->fs_bsize && osize > 0) { 510 } 572 error = ffs_realloccg(ip, nb, dp->di_db[nb], 511 if (nb != 0) { 573 ffs_blkpref_ufs2(ip, lastlbn, (int)nb, 512 /* 574 &dp->di_db[0]), osize, (int)fs->fs_bsize, 513 * Consider need to reallocate a fragment. 
575 cred, &bp); 514 */ 576 if (error) 515 osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); 577 return (error); 516 nsize = fragroundup(fs, size); 578 if (DOINGSOFTDEP(vp)) 517 if (nsize <= osize) { 579 softdep_setup_allocdirect(ip, nb, 518 error = bread(vp, -1 - lbn, osize, NOCRED, &bp 580 dbtofsb(fs, bp->b_blkno), ); 581 dp->di_db[nb], 519 if (error) { 582 fs->fs_bsize, osize, bp); 520 brelse(bp); 583 ip->i_size = smalllblktosize(fs, nb + 1); 521 return (error); 584 dp->di_size = ip->i_size; 522 } 585 dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); 523 bp->b_blkno = fsbtodb(fs, nb); 586 ip->i_flag |= IN_CHANGE | IN_UPDATE; 524 bp->b_xflags |= BX_ALTDATA; 587 if (flags & IO_SYNC) 525 } else { 588 bwrite(bp); 526 error = ffs_realloccg(ip, -1 - lbn, 589 else 527 dp->di_extb[lbn], 590 bawrite(bp); 528 ffs_blkpref_ufs2(ip, lbn, (int)lbn, 591 } 529 &dp->di_extb[0]), osize, nsize, cred, &bp) 592 } ; 593 /* 530 if (error) 594 * The first NDADDR blocks are direct blocks 531 return (error); 595 */ 532 bp->b_xflags |= BX_ALTDATA; 596 if (lbn < NDADDR) { 533 if (DOINGSOFTDEP(vp)) 597 if (flags & BA_METAONLY) 534 softdep_setup_allocext(ip, lbn, 598 panic("ffs_balloc_ufs2: BA_METAONLY for direct block") 535 dbtofsb(fs, bp->b_blkno), nb, ; 536 nsize, osize, bp); 599 nb = dp->di_db[lbn]; 537 } 600 if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { 538 } else { 601 error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); 539 if (dp->di_extsize < smalllblktosize(fs, lbn + 1)) 602 if (error) { 540 nsize = fragroundup(fs, size); 603 brelse(bp); 541 else 604 return (error); 542 nsize = fs->fs_bsize; 605 } 543 error = ffs_alloc(ip, lbn, 606 bp->b_blkno = fsbtodb(fs, nb); 544 ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0] 607 *bpp = bp; ), 608 return (0); 545 nsize, cred, &newb); 609 } 08/15/03 13:03:19 sys/ufs/ffs/ffs_balloc.c 3 610 if (nb != 0) { 673 allocblk = allociblk; 611 /* 674 if (nb == 0) { 612 * Consider need to reallocate a fragment. 675 pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0); 613 */ 676 if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, 614 osize = fragroundup(fs, blkoff(fs, ip->i_size)); 677 cred, &newb)) != 0) 615 nsize = fragroundup(fs, size); 678 return (error); 616 if (nsize <= osize) { 679 nb = newb; 617 error = bread(vp, lbn, osize, NOCRED, &bp); 680 *allocblk++ = nb; 618 if (error) { 681 bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0); 619 brelse(bp); 682 bp->b_blkno = fsbtodb(fs, nb); 620 return (error); 683 vfs_bio_clrbuf(bp); 621 } 684 if (DOINGSOFTDEP(vp)) { 622 bp->b_blkno = fsbtodb(fs, nb); 685 softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_of 623 } else { f, 624 error = ffs_realloccg(ip, lbn, dp->di_db[lbn], 686 newb, 0, fs->fs_bsize, 0, bp); 625 ffs_blkpref_ufs2(ip, lbn, (int)lbn, 687 bdwrite(bp); 626 &dp->di_db[0]), osize, nsize, cred, &bp 688 } else { ); 689 /* 627 if (error) 690 * Write synchronously so that indirect blocks 628 return (error); 691 * never point at garbage. 
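In the direct-block path of ffs_balloc_ufs2() shown here, the choice between re-reading an existing fragment and calling ffs_realloccg() comes down to rounding both the currently allocated bytes and the requested bytes up to whole fragments and comparing the results. The snippet below works that comparison through with a 2K fragment size; the byte counts are made up for the example.

#include <stdio.h>

#define FSIZE	2048			/* fragment size, like fs_fsize */
#define QFMASK	(FSIZE - 1)		/* like fs_qfmask */
#define FMASK	(~QFMASK)		/* like fs_fmask */
#define fragroundup(size)	(((size) + QFMASK) & FMASK)

int
main(void)
{
	long old_bytes = 3000;			/* bytes in the last, partial block */
	long req_bytes = 5000;			/* bytes the pending write needs */
	long osize = fragroundup(old_bytes);	/* rounds up to 4096 */
	long nsize = fragroundup(req_bytes);	/* rounds up to 6144 */

	if (nsize <= osize)
		printf("reuse existing %ld-byte fragment\n", osize);
	else
		printf("grow fragment from %ld to %ld bytes\n", osize, nsize);
	return (0);
}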
629 if (DOINGSOFTDEP(vp)) 692 */ 630 softdep_setup_allocdirect(ip, lbn, 693 if (DOINGASYNC(vp)) 631 dbtofsb(fs, bp->b_blkno), nb, 694 bdwrite(bp); 632 nsize, osize, bp); 695 else if ((error = bwrite(bp)) != 0) 633 } 696 goto fail; 634 } else { 697 } 635 if (ip->i_size < smalllblktosize(fs, lbn + 1)) 698 allocib = &dp->di_ib[indirs[0].in_off]; 636 nsize = fragroundup(fs, size); 699 *allocib = nb; 637 else 700 ip->i_flag |= IN_CHANGE | IN_UPDATE; 638 nsize = fs->fs_bsize; 701 } 639 error = ffs_alloc(ip, lbn, 702 /* 640 ffs_blkpref_ufs2(ip, lbn, (int)lbn, 703 * Fetch through the indirect blocks, allocating as necessary. 641 &dp->di_db[0]), nsize, cred, &newb); 704 */ 642 if (error) 705 for (i = 1;;) { 643 return (error); 706 error = bread(vp, 644 bp = getblk(vp, lbn, nsize, 0, 0, 0); 707 indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); 645 bp->b_blkno = fsbtodb(fs, newb); 708 if (error) { 646 if (flags & BA_CLRBUF) 709 brelse(bp); 647 vfs_bio_clrbuf(bp); 710 goto fail; 648 if (DOINGSOFTDEP(vp)) 711 } 649 softdep_setup_allocdirect(ip, lbn, newb, 0, 712 bap = (ufs2_daddr_t *)bp->b_data; 650 nsize, 0, bp); 713 nb = bap[indirs[i].in_off]; 651 } 714 if (i == num) 652 dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); 715 break; 653 ip->i_flag |= IN_CHANGE | IN_UPDATE; 716 i += 1; 654 *bpp = bp; 717 if (nb != 0) { 655 return (0); 718 bqrelse(bp); 656 } 719 continue; 657 /* 720 } 658 * Determine the number of levels of indirection. 721 if (pref == 0) 659 */ 722 pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0) 660 pref = 0; ; 661 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 723 if ((error = 662 return(error); 724 ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) 663 #ifdef DIAGNOSTIC != 0) { 664 if (num < 1) 725 brelse(bp); 665 panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block") 726 goto fail; ; 727 } 666 #endif 728 nb = newb; 667 /* 729 *allocblk++ = nb; 668 * Fetch the first indirect block allocating if necessary. 730 nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); 669 */ 731 nbp->b_blkno = fsbtodb(fs, nb); 670 --num; 732 vfs_bio_clrbuf(nbp); 671 nb = dp->di_ib[indirs[0].in_off]; 733 if (DOINGSOFTDEP(vp)) { 672 allocib = NULL; 734 softdep_setup_allocindir_meta(nbp, ip, bp, 08/15/03 13:03:19 sys/ufs/ffs/ffs_balloc.c 4 735 indirs[i - 1].in_off, nb); 800 } 736 bdwrite(nbp); 801 *bpp = nbp; 737 } else { 802 return (0); 738 /* 803 } 739 * Write synchronously so that indirect blocks 804 brelse(bp); 740 * never point at garbage. 805 /* 741 */ 806 * If requested clear invalid portions of the buffer. If we 742 if ((error = bwrite(nbp)) != 0) { 807 * have to do a read-before-write (typical if BA_CLRBUF is set), 743 brelse(bp); 808 * try to do some read-ahead in the sequential case to reduce 744 goto fail; 809 * the number of I/O transactions. 745 } 810 */ 746 } 811 if (flags & BA_CLRBUF) { 747 bap[indirs[i - 1].in_off] = nb; 812 int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; 748 if (allocib == NULL && unwindidx < 0) 813 if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) 749 unwindidx = i - 1; { 750 /* 814 error = cluster_read(vp, ip->i_size, lbn, 751 * If required, write synchronously, otherwise use 815 (int)fs->fs_bsize, NOCRED, 752 * delayed write. 
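The loop on this page walks down the chain of indirect blocks mapped out by ufs_getlbns(), reading each level and, when a slot is still zero, allocating a fresh zeroed block and linking it into its parent before descending; the synchronous write ordering is what keeps an indirect block from ever pointing at garbage. The sketch below models only the walk-and-allocate-on-miss shape with in-memory structures: the fan-out is tiny, there is no I/O ordering, error handling is elided, and all names and types are invented for the illustration.

#include <stdio.h>
#include <stdlib.h>

#define NINDIR 4		/* pointers per indirect block (tiny, for the demo) */

struct iblock {
	struct iblock *ptr[NINDIR];	/* stands in for ufs2_daddr_t slots */
};

static struct iblock *
walk_indirs(struct iblock *root, const int *offs, int levels)
{
	struct iblock *bp = root, *nbp;
	int i;

	for (i = 0; i < levels; i++) {
		nbp = bp->ptr[offs[i]];
		if (nbp == NULL) {		/* missing: allocate and link into parent */
			nbp = calloc(1, sizeof(*nbp));
			bp->ptr[offs[i]] = nbp;
		}
		bp = nbp;
	}
	return (bp);		/* block that holds the final data pointer */
}

int
main(void)
{
	struct iblock root = { { 0 } };
	int offs[2] = { 1, 3 };		/* like indirs[i].in_off at each level */

	walk_indirs(&root, offs, 2);
	printf("level-1 slot 1 now %s\n",
	    root.ptr[1] != NULL ? "allocated" : "empty");
	return (0);
}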
816 MAXBSIZE, seqcount, &nbp); 753 */ 817 } else { 754 if (flags & IO_SYNC) { 818 error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp 755 bwrite(bp); ); 756 } else { 819 } 757 if (bp->b_bufsize == fs->fs_bsize) 820 if (error) { 758 bp->b_flags |= B_CLUSTEROK; 821 brelse(nbp); 759 bdwrite(bp); 822 goto fail; 760 } 823 } 761 } 824 } else { 762 /* 825 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); 763 * If asked only for the indirect block, then return it. 826 nbp->b_blkno = fsbtodb(fs, nb); 764 */ 827 } 765 if (flags & BA_METAONLY) { 828 *bpp = nbp; 766 *bpp = bp; 829 return (0); 767 return (0); 830 fail: 768 } 831 /* 769 /* 832 * If we have failed to allocate any blocks, simply return the error. 770 * Get the data block, allocating if necessary. 833 * This is the usual case and avoids the need to fsync the file. 771 */ 834 */ 772 if (nb == 0) { 835 if (allocblk == allociblk && allocib == NULL && unwindidx == -1) 773 pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]); 836 return (error); 774 error = ffs_alloc(ip, 837 /* 775 lbn, pref, (int)fs->fs_bsize, cred, &newb); 838 * If we have failed part way through block allocation, we 776 if (error) { 839 * have to deallocate any indirect blocks that we have allocated. 777 brelse(bp); 840 * We have to fsync the file before we start to get rid of all 778 goto fail; 841 * of its dependencies so that we do not leave them dangling. 779 } 842 * We have to sync it at the end so that the soft updates code 780 nb = newb; 843 * does not find any untracked changes. Although this is really 781 *allocblk++ = nb; 844 * slow, running out of disk space is not expected to be a common 782 nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0); 845 * occurence. The error return from fsync is ignored as we already 783 nbp->b_blkno = fsbtodb(fs, nb); 846 * have an error to return to the user. 784 if (flags & BA_CLRBUF) 847 */ 785 vfs_bio_clrbuf(nbp); 848 (void) VOP_FSYNC(vp, cred, MNT_WAIT, td); 786 if (DOINGSOFTDEP(vp)) 849 for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) { 787 softdep_setup_allocindir_page(ip, lbn, bp, 850 ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number 788 indirs[i].in_off, nb, 0, nbp); ); 789 bap[indirs[i].in_off] = nb; 851 deallocated += fs->fs_bsize; 790 /* 852 } 791 * If required, write synchronously, otherwise use 853 if (allocib != NULL) { 792 * delayed write. 854 *allocib = 0; 793 */ 855 } else if (unwindidx >= 0) { 794 if (flags & IO_SYNC) { 856 int r; 795 bwrite(bp); 857 796 } else { 858 r = bread(vp, indirs[unwindidx].in_lbn, 797 if (bp->b_bufsize == fs->fs_bsize) 859 (int)fs->fs_bsize, NOCRED, &bp); 798 bp->b_flags |= B_CLUSTEROK; 860 if (r) { 799 bdwrite(bp); 861 panic("Could not unwind indirect block, error %d", r); 08/15/03 13:03:19 sys/ufs/ffs/ffs_balloc.c 5 862 brelse(bp); 863 } else { 864 bap = (ufs2_daddr_t *)bp->b_data; 865 bap[indirs[unwindidx].in_off] = 0; 866 if (flags & IO_SYNC) { 867 bwrite(bp); 868 } else { 869 if (bp->b_bufsize == fs->fs_bsize) 870 bp->b_flags |= B_CLUSTEROK; 871 bdwrite(bp); 872 } 873 } 874 } 875 if (deallocated) { 876 #ifdef QUOTA 877 /* 878 * Restore user’s disk quota because allocation failed. 879 */ 880 (void) chkdq(ip, -btodb(deallocated), cred, FORCE); 881 #endif 882 dp->di_blocks -= btodb(deallocated); 883 ip->i_flag |= IN_CHANGE | IN_UPDATE; 884 } 885 (void) VOP_FSYNC(vp, cred, MNT_WAIT, td); 886 return (error); 887 } 10/04/03 13:38:32 sys/ufs/ffs/ffs_vnops.c 1 1 /* 585 struct buf *bp; 2 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. 
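The fail: label on this page shows the unwind strategy: every block obtained during the allocation is remembered in allociblk[], so a failure part way through can walk that list, return the blocks, and clear whichever pointer had already been published. The same record-then-unwind pattern, reduced to plain malloc()/free(), looks like this.

#include <stdio.h>
#include <stdlib.h>

#define MAXALLOC 8	/* like NIADDR + 1 in the kernel code */

int
main(void)
{
	void *allociblk[MAXALLOC], **allocblk = allociblk, **blkp;
	int i, error = 0;

	for (i = 0; i < 3; i++) {
		void *p = malloc(1024);
		if (p == NULL || i == 2) {	/* force a failure on step 2 */
			free(p);
			error = 1;
			break;
		}
		*allocblk++ = p;		/* record each successful allocation */
	}
	if (error) {
		for (blkp = allociblk; blkp < allocblk; blkp++)
			free(*blkp);		/* unwind, like the loop over allociblk */
		printf("unwound %d allocations\n", (int)(allocblk - allociblk));
	}
	return (error);
}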
586 struct thread *td; 3 * All rights reserved. 587 ufs_lbn_t lbn; 4 * 588 off_t osize; 5 * This software was developed for the FreeBSD Project by Marshall 589 int seqcount; 6 * Kirk McKusick and Network Associates Laboratories, the Security 590 int blkoffset, error, extended, flags, ioflag, resid, size, xfersize; 7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 591 vm_object_t object; 8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 592 9 * research program 593 vp = ap->a_vp; 10 * 594 uio = ap->a_uio; 11 * Copyright (c) 1982, 1986, 1989, 1993 595 ioflag = ap->a_ioflag; 12 * The Regents of the University of California. All rights reserved. 596 if (ap->a_ioflag & IO_EXT) 13 * 597 #ifdef notyet 14 * Redistribution and use in source and binary forms, with or without 598 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); 15 * modification, are permitted provided that the following conditions 599 #else 16 * are met: 600 panic("ffs_read+IO_EXT"); 17 * 1. Redistributions of source code must retain the above copyright 601 #endif 18 * notice, this list of conditions and the following disclaimer. 602 19 * 2. Redistributions in binary form must reproduce the above copyright 603 GIANT_REQUIRED; 20 * notice, this list of conditions and the following disclaimer in the 604 21 * documentation and/or other materials provided with the distribution. 605 extended = 0; 22 * 3. All advertising materials mentioning features or use of this software 606 seqcount = ap->a_ioflag >> 16; 23 * must display the following acknowledgement: 607 ip = VTOI(vp); 24 * This product includes software developed by the University of 608 25 * California, Berkeley and its contributors. 609 object = vp->v_object; 26 * 4. Neither the name of the University nor the names of its contributors 610 if (object) { 27 * may be used to endorse or promote products derived from this software 611 vm_object_reference(object); 28 * without specific prior written permission. 612 } 29 * 613 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 614 #ifdef DIAGNOSTIC 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 615 if (uio->uio_rw != UIO_WRITE) 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 616 panic("ffswrite: mode"); 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 617 #endif 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 618 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 619 switch (vp->v_type) { 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 620 case VREG: 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 621 if (ioflag & IO_APPEND) 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 622 uio->uio_offset = ip->i_size; 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 623 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) { 40 * SUCH DAMAGE. 624 if (object) { 41 * 625 VM_OBJECT_LOCK(object); 42 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 626 vm_object_vndeallocate(object); 43 */ 627 } 44 628 return (EPERM); 45 #include 629 } 46 __FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_vnops.c,v 1.119 2003/10/04 20:38:32 al 630 /* FALLTHROUGH */ c Exp $"); 631 case VLNK: 632 break; 633 case VDIR: 569 /* 634 panic("ffswrite: dir write"); 570 * Vnode op for writing. 
635 break; 571 */ 636 default: 572 static int 637 panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type, 573 ffs_write(ap) 638 (int)uio->uio_offset, 574 struct vop_write_args /* { 639 (int)uio->uio_resid 575 struct vnode *a_vp; 640 ); 576 struct uio *a_uio; 641 } 577 int a_ioflag; 642 578 struct ucred *a_cred; 643 fs = ip->i_fs; 579 } */ *ap; 644 if (uio->uio_offset < 0 || 580 { 645 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) 581 struct vnode *vp; { 582 struct uio *uio; 646 if (object) { 583 struct inode *ip; 647 VM_OBJECT_LOCK(object); 584 struct fs *fs; 648 vm_object_vndeallocate(object); 10/04/03 13:38:32 sys/ufs/ffs/ffs_vnops.c 2 649 } 714 if (uio->uio_offset + xfersize > ip->i_size) { 650 return (EFBIG); 715 ip->i_size = uio->uio_offset + xfersize; 651 } 716 DIP(ip, i_size) = ip->i_size; 652 /* 717 extended = 1; 653 * Maybe this should be above the vnode op call, but so long as 718 } 654 * file servers have no limits, I don’t think it matters. 719 655 */ 720 size = blksize(fs, ip, lbn) - bp->b_resid; 656 td = uio->uio_td; 721 if (size < xfersize) 657 if (vp->v_type == VREG && td && 722 xfersize = size; 658 uio->uio_offset + uio->uio_resid > 723 659 td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 724 error = 660 PROC_LOCK(td->td_proc); 725 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio 661 psignal(td->td_proc, SIGXFSZ); ); 662 PROC_UNLOCK(td->td_proc); 726 if ((ioflag & (IO_VMIO|IO_DIRECT)) && 663 if (object) { 727 (LIST_FIRST(&bp->b_dep) == NULL)) { 664 VM_OBJECT_LOCK(object); 728 bp->b_flags |= B_RELBUF; 665 vm_object_vndeallocate(object); 729 } 666 } 730 667 return (EFBIG); 731 /* 668 } 732 * If IO_SYNC each buffer is written synchronously. Otherwise 669 733 * if we have a severe page deficiency write the buffer 670 resid = uio->uio_resid; 734 * asynchronously. Otherwise try to cluster, and if that 671 osize = ip->i_size; 735 * doesn’t do it then either do an async write (if O_DIRECT), 672 if (seqcount > BA_SEQMAX) 736 * or a delayed write (if not). 673 flags = BA_SEQMAX << BA_SEQSHIFT; 737 */ 674 else 738 if (ioflag & IO_SYNC) { 675 flags = seqcount << BA_SEQSHIFT; 739 (void)bwrite(bp); 676 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) 740 } else if (vm_page_count_severe() || 677 flags |= IO_SYNC; 741 buf_dirty_count_severe() || 678 742 (ioflag & IO_ASYNC)) { 679 for (error = 0; uio->uio_resid > 0;) { 743 bp->b_flags |= B_CLUSTEROK; 680 lbn = lblkno(fs, uio->uio_offset); 744 bawrite(bp); 681 blkoffset = blkoff(fs, uio->uio_offset); 745 } else if (xfersize + blkoffset == fs->fs_bsize) { 682 xfersize = fs->fs_bsize - blkoffset; 746 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { 683 if (uio->uio_resid < xfersize) 747 bp->b_flags |= B_CLUSTEROK; 684 xfersize = uio->uio_resid; 748 cluster_write(bp, ip->i_size, seqcount); 685 749 } else { 686 if (uio->uio_offset + xfersize > ip->i_size) 750 bawrite(bp); 687 vnode_pager_setsize(vp, uio->uio_offset + xfersize); 751 } 688 752 } else if (ioflag & IO_DIRECT) { 689 /* 753 bp->b_flags |= B_CLUSTEROK; 690 * We must perform a read-before-write if the transfer size 754 bawrite(bp); 691 * does not cover the entire buffer. 755 } else { 692 */ 756 bp->b_flags |= B_CLUSTEROK; 693 if (fs->fs_bsize > xfersize) 757 bdwrite(bp); 694 flags |= BA_CLRBUF; 758 } 695 else 759 if (error || xfersize == 0) 696 flags &= ˜BA_CLRBUF; 760 break; 697 /* XXX is uio->uio_offset the right thing here? 
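Each pass through the ffs_write() loop above handles at most the remainder of the current block: the logical block number and the offset within it come from lblkno() and blkoff(), the transfer is clipped to the bytes left in the request, and any transfer that does not cover the whole block sets BA_CLRBUF so the buffer is read or cleared before being partially overwritten. The program below replays that sizing for a filesystem with 16K blocks; the shift and mask constants stand in for fs_bshift and fs_qbmask.

#include <stdio.h>

#define BSHIFT	14			/* fs_bshift for a 16K block */
#define BSIZE	(1L << BSHIFT)		/* fs_bsize */
#define QBMASK	(BSIZE - 1)		/* fs_qbmask */
#define lblkno(loc)	((loc) >> BSHIFT)
#define blkoff(loc)	((loc) & QBMASK)

int
main(void)
{
	long offset = 30000, resid = 40000;

	while (resid > 0) {
		long lbn = lblkno(offset);
		long blkoffset = blkoff(offset);
		long xfersize = BSIZE - blkoffset;	/* rest of this block */

		if (resid < xfersize)
			xfersize = resid;		/* rest of the request */
		printf("lbn %ld: write %ld bytes at offset %ld%s\n",
		    lbn, xfersize, blkoffset,
		    xfersize < BSIZE ? " (read-before-write)" : "");
		offset += xfersize;
		resid -= xfersize;
	}
	return (0);
}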
*/ 761 ip->i_flag |= IN_CHANGE | IN_UPDATE; 698 error = UFS_BALLOC(vp, uio->uio_offset, xfersize, 762 } 699 ap->a_cred, flags, &bp); 763 /* 700 if (error != 0) 764 * If we successfully wrote any data, and we are not the superuser 701 break; 765 * we clear the setuid and setgid bits as a precaution against 702 /* 766 * tampering. 703 * If the buffer is not valid we have to clear out any 767 */ 704 * garbage data from the pages instantiated for the buffer. 768 if (resid > uio->uio_resid && ap->a_cred && 705 * If we do not, a failed uiomove() during a write can leave 769 suser_cred(ap->a_cred, PRISON_ROOT)) { 706 * the prior contents of the pages exposed to a userland 770 ip->i_mode &= ˜(ISUID | ISGID); 707 * mmap(). XXX deal with uiomove() errors a better way. 771 DIP(ip, i_mode) = ip->i_mode; 708 */ 772 } 709 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) 773 if (resid > uio->uio_resid) 710 vfs_bio_clrbuf(bp); 774 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0)); 711 if (ioflag & IO_DIRECT) 775 if (error) { 712 bp->b_flags |= B_DIRECT; 776 if (ioflag & IO_UNIT) { 713 777 (void)UFS_TRUNCATE(vp, osize, 10/04/03 13:38:32 sys/ufs/ffs/ffs_vnops.c 3 778 IO_NORMAL | (ioflag & IO_SYNC), 779 ap->a_cred, uio->uio_td); 780 uio->uio_offset -= resid - uio->uio_resid; 781 uio->uio_resid = resid; 782 } 783 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) 784 error = UFS_UPDATE(vp, 1); 785 786 if (object) { 787 VM_OBJECT_LOCK(object); 788 vm_object_vndeallocate(object); 789 } 790 791 return (error); 792 } 11/15/03 23:08:27 sys/ufs/ffs/fs.h 1 1 /* 66 * room for the disk label and a bigger bootstrap, and for really piggy 2 * Copyright (c) 1982, 1986, 1993 67 * systems we check at 256K from the front if the first three fail. In 3 * The Regents of the University of California. All rights reserved. 68 * all cases the size of the superblock will be SBLOCKSIZE. All values are 4 * 69 * given in byte-offset form, so they do not imply a sector size. The 5 * Redistribution and use in source and binary forms, with or without 70 * SBLOCKSEARCH specifies the order in which the locations should be searched. 6 * modification, are permitted provided that the following conditions 71 */ 7 * are met: 72 #define SBLOCK_FLOPPY 0 8 * 1. Redistributions of source code must retain the above copyright 73 #define SBLOCK_UFS1 8192 9 * notice, this list of conditions and the following disclaimer. 74 #define SBLOCK_UFS2 65536 10 * 2. Redistributions in binary form must reproduce the above copyright 75 #define SBLOCK_PIGGY 262144 11 * notice, this list of conditions and the following disclaimer in the 76 #define SBLOCKSIZE 8192 12 * documentation and/or other materials provided with the distribution. 77 #define SBLOCKSEARCH \ 13 * 3. All advertising materials mentioning features or use of this software 78 { SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 } 14 * must display the following acknowledgement: 79 15 * This product includes software developed by the University of 80 /* 16 * California, Berkeley and its contributors. 81 * Max number of fragments per block. This value is NOT tweakable. 17 * 4. Neither the name of the University nor the names of its contributors 82 */ 18 * may be used to endorse or promote products derived from this software 83 #define MAXFRAG 8 19 * without specific prior written permission. 
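fs.h opens with the four candidate superblock locations and the SBLOCKSEARCH order in which they are probed. A minimal user-space probe along those lines might look like the following; it assumes the magic word sits in the last four bytes of the 1376-byte struct fs (inferred from the CTASSERT and the field order later in this header, not taken from an installed header), uses native byte order, and skips the extra fs_sblockloc validation a real reader would perform.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define SBLOCKSIZE	8192
#define FS_UFS1_MAGIC	0x011954
#define FS_UFS2_MAGIC	0x19540119
#define FS_MAGIC_OFFSET	1372	/* inferred: last int32 of the 1376-byte struct fs */

static const long sblock_try[] = { 65536, 8192, 0, 262144, -1 };	/* SBLOCKSEARCH */

int
main(int argc, char **argv)
{
	char buf[SBLOCKSIZE];
	int32_t magic;
	int fd, i;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return (1);
	for (i = 0; sblock_try[i] != -1; i++) {
		if (pread(fd, buf, sizeof(buf), sblock_try[i]) != (ssize_t)sizeof(buf))
			continue;
		memcpy(&magic, buf + FS_MAGIC_OFFSET, sizeof(magic));
		if (magic == FS_UFS1_MAGIC || magic == FS_UFS2_MAGIC) {
			printf("%s superblock at offset %ld\n",
			    magic == FS_UFS2_MAGIC ? "UFS2" : "UFS1",
			    sblock_try[i]);
			break;
		}
	}
	close(fd);
	return (0);
}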
84 20 * 85 /* 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 86 * Addresses stored in inodes are capable of addressing fragments 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 87 * of ‘blocks’. File system blocks of at most size MAXBSIZE can 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 88 * be optionally broken into 2, 4, or 8 pieces, each of which is 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 89 * addressable; these pieces may be DEV_BSIZE, or some multiple of 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 90 * a DEV_BSIZE unit. 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 91 * 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 92 * Large files consist of exclusively large data blocks. To avoid 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 93 * undue wasted disk space, the last data block of a small file may be 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 94 * allocated as only as many fragments of a large block as are 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 95 * necessary. The filesystem format retains only a single pointer 31 * SUCH DAMAGE. 96 * to such a fragment, which is a piece of a single large block that 32 * 97 * has been divided. The size of such a fragment is determinable from 33 * @(#)fs.h 8.13 (Berkeley) 3/21/95 98 * information in the inode, using the ‘‘blksize(fs, ip, lbn)’’ macro. 34 * $FreeBSD: src/sys/ufs/ffs/fs.h,v 1.40 2003/11/16 07:08:27 wes Exp $ 99 * 35 */ 100 * The filesystem records space availability at the fragment level; 36 101 * to determine block availability, aligned fragments are examined. 37 #ifndef _UFS_FFS_FS_H_ 102 */ 38 #define _UFS_FFS_FS_H_ 103 39 104 /* 40 /* 105 * MINBSIZE is the smallest allowable block size. 41 * Each disk drive contains some number of filesystems. 106 * In order to insure that it is possible to create files of size 42 * A filesystem consists of a number of cylinder groups. 107 * 2^32 with only two levels of indirection, MINBSIZE is set to 4096. 43 * Each cylinder group has inodes and data. 108 * MINBSIZE must be big enough to hold a cylinder group block, 44 * 109 * thus changes to (struct cg) must keep its size within MINBSIZE. 45 * A filesystem is described by its super-block, which in turn 110 * Note that super blocks are always of size SBSIZE, 46 * describes the cylinder groups. The super-block is critical 111 * and that both SBSIZE and MAXBSIZE must be >= MINBSIZE. 47 * data and is replicated in each cylinder group to protect against 112 */ 48 * catastrophic loss. This is done at ‘newfs’ time and the critical 113 #define MINBSIZE 4096 49 * super-block data does not change, so the copies need not be 114 50 * referenced further unless disaster strikes. 115 /* 51 * 116 * The path name on which the filesystem is mounted is maintained 52 * For filesystem fs, the offsets of the various blocks of interest 117 * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in 53 * are given in the super block as: 118 * the super block for this name. 
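The MINBSIZE comment states that a 4096-byte block is the smallest size that still lets two levels of indirection reach a 2^32-byte file. The arithmetic is easy to check: a 4096-byte indirect block holds 1024 four-byte UFS1 block pointers, and the usual twelve direct blocks (NDADDR is taken as 12 here; the value is not spelled out in this listing) plus single and double indirection then cover a little over 4.2 GB.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t bsize = 4096;
	uint64_t nindir = bsize / sizeof(int32_t);	/* 1024 pointers per block */
	uint64_t ndaddr = 12;				/* direct blocks in the inode */
	uint64_t bytes = (ndaddr + nindir + nindir * nindir) * bsize;

	printf("two indirection levels reach: %llu bytes\n",
	    (unsigned long long)bytes);			/* 4299210752 */
	printf("2^32                        : %llu bytes\n",
	    (unsigned long long)(1ULL << 32));		/* 4294967296 */
	return (0);
}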
54 * [fs->fs_sblkno] Super-block 119 */ 55 * [fs->fs_cblkno] Cylinder group block 120 #define MAXMNTLEN 468 56 * [fs->fs_iblkno] Inode blocks 121 57 * [fs->fs_dblkno] Data blocks 122 /* 58 * The beginning of cylinder group cg in fs, is given by 123 * The volume name for this filesystem is maintained in fs_volname. 59 * the ‘‘cgbase(fs, cg)’’ macro. 124 * MAXVOLLEN defines the length of the buffer allocated. 60 * 125 */ 61 * Depending on the architecture and the media, the superblock may 126 #define MAXVOLLEN 32 62 * reside in any one of four places. For tiny media where every block 127 63 * counts, it is placed at the very front of the partition. Historically, 128 /* 64 * UFS1 placed it 8K from the front to leave room for the disk label and 129 * There is a 128-byte region in the superblock reserved for in-core 65 * a small bootstrap. For UFS2 it got moved to 64K from the front to leave 130 * pointers to summary information. Originally this included an array 11/15/03 23:08:27 sys/ufs/ffs/fs.h 2 131 * of pointers to blocks of struct csum; now there are just a few 196 * snapshot. When the other snapshot is freed, the BLK_SNAP entries 132 * pointers and the remaining space is padded with fs_ocsp[]. 197 * are converted to BLK_NOCOPY. These are needed to allow fsck to 133 * 198 * identify blocks that are in use by other snapshots (which are 134 * NOCSPTRS determines the size of this padding. One pointer (fs_csp) 199 * expunged from this snapshot). 135 * is taken away to point to a contiguous array of struct csum for 200 */ 136 * all cylinder groups; a second (fs_maxcluster) points to an array 201 #define BLK_NOCOPY ((ufs2_daddr_t)(1)) 137 * of cluster sizes that is computed as cylinder groups are inspected, 202 #define BLK_SNAP ((ufs2_daddr_t)(2)) 138 * and the third points to an array that tracks the creation of new 203 139 * directories. A fourth pointer, fs_active, is used when creating 204 /* 140 * snapshots; it points to a bitmap of cylinder groups for which the 205 * Sysctl values for the fast filesystem. 141 * free-block bitmap has changed since the snapshot operation began. 206 */ 142 */ 207 #define FFS_ADJ_REFCNT 1 /* adjust inode reference count */ 143 #define NOCSPTRS ((128 / sizeof(void *)) - 4) 208 #define FFS_ADJ_BLKCNT 2 /* adjust inode used block count */ 144 209 #define FFS_BLK_FREE 3 /* free range of blocks in map */ 145 /* 210 #define FFS_DIR_FREE 4 /* free specified dir inodes in map */ 146 * A summary of contiguous blocks of various sizes is maintained 211 #define FFS_FILE_FREE 5 /* free specified file inodes in map * 147 * in each cylinder group. Normally this is set by the initial / 148 * value of fs_maxcontig. To conserve space, a maximum summary size 212 #define FFS_SET_FLAGS 6 /* set filesystem flags */ 149 * is set by FS_MAXCONTIG. 213 #define FFS_MAXID 7 /* number of valid ffs ids */ 150 */ 214 151 #define FS_MAXCONTIG 16 215 /* 152 216 * Command structure passed in to the filesystem to adjust filesystem values. 153 /* 217 */ 154 * MINFREE gives the minimum acceptable percentage of filesystem 218 #define FFS_CMD_VERSION 0x19790518 /* version ID */ 155 * blocks which may be free. If the freelist drops below this level 219 struct fsck_cmd { 156 * only the superuser may continue to allocate blocks. 
This may 220 int32_t version; /* version of command structure */ 157 * be set to 0 if no reserve of free blocks is deemed necessary, 221 int32_t handle; /* reference to filesystem to be changed */ 158 * however throughput drops by fifty percent if the filesystem 222 int64_t value; /* inode or block number to be affected */ 159 * is run at between 95% and 100% full; thus the minimum default 223 int64_t size; /* amount or range to be adjusted */ 160 * value of fs_minfree is 5%. However, to get good clustering 224 int64_t spare; /* reserved for future use */ 161 * performance, 10% is a better choice. hence we use 10% as our 225 }; 162 * default value. With 10% free space, fragmentation is not a 226 163 * problem, so we choose to optimize for time. 227 /* 164 */ 228 * Per cylinder group information; summarized in blocks allocated 165 #define MINFREE 8 229 * from first cylinder group data blocks. These blocks have to be 166 #define DEFAULTOPT FS_OPTTIME 230 * read in from fs_csaddr (size fs_cssize) in addition to the 167 231 * super block. 168 /* 232 */ 169 * Grigoriy Orlov has done some extensive work to fine 233 struct csum { 170 * tune the layout preferences for directories within a filesystem. 234 int32_t cs_ndir; /* number of directories */ 171 * His algorithm can be tuned by adjusting the following parameters 235 int32_t cs_nbfree; /* number of free blocks */ 172 * which tell the system the average file size and the average number 236 int32_t cs_nifree; /* number of free inodes */ 173 * of files per directory. These defaults are well selected for typical 237 int32_t cs_nffree; /* number of free frags */ 174 * filesystems, but may need to be tuned for odd cases like filesystems 238 }; 175 * being used for sqiud caches or news spools. 239 struct csum_total { 176 */ 240 int64_t cs_ndir; /* number of directories */ 177 #define AVFILESIZ 16384 /* expected average file size */ 241 int64_t cs_nbfree; /* number of free blocks */ 178 #define AFPDIR 64 /* expected number of files per directory */ 242 int64_t cs_nifree; /* number of free inodes */ 179 243 int64_t cs_nffree; /* number of free frags */ 180 /* 244 int64_t cs_numclusters; /* number of free clusters */ 181 * The maximum number of snapshot nodes that can be associated 245 int64_t cs_spare[3]; /* future expansion */ 182 * with each filesystem. This limit affects only the number of 246 }; 183 * snapshot files that can be recorded within the superblock so 247 184 * that they can be found when the filesystem is mounted. However, 248 /* 185 * maintaining too many will slow the filesystem performance, so 249 * Super block for an FFS filesystem. 186 * having this limit is a good idea. 250 */ 187 */ 251 struct fs { 188 #define FSMAXSNAP 20 252 int32_t fs_firstfield; /* historic filesystem linked list, */ 189 253 int32_t fs_unused_1; /* used for incore super blocks */ 190 /* 254 int32_t fs_sblkno; /* offset of super-block in filesys */ 191 * Used to identify special blocks in snapshots: 255 int32_t fs_cblkno; /* offset of cyl-block in filesys */ 192 * 256 int32_t fs_iblkno; /* offset of inode-blocks in filesys * 193 * BLK_NOCOPY - A block that was unallocated at the time the snapshot / 194 * was taken, hence does not need to be copied when written. 
257 int32_t fs_dblkno; /* offset of first data after cg */ 195 * BLK_SNAP - A block held by another snapshot that is not needed by this 258 int32_t fs_old_cgoffset; /* cylinder group offset in cylinder * 11/15/03 23:08:27 sys/ufs/ffs/fs.h 3 / */ 259 int32_t fs_old_cgmask; /* used to calc mod fs_ntrak */ 319 u_int8_t *fs_contigdirs; /* # of contiguously allocated dirs */ 260 int32_t fs_old_time; /* last time written */ 320 struct csum *fs_csp; /* cg summary info buffer for fs_cs */ 261 int32_t fs_old_size; /* number of blocks in fs */ 321 int32_t *fs_maxcluster; /* max cluster in each cyl group */ 262 int32_t fs_old_dsize; /* number of data blocks in fs */ 322 u_int *fs_active; /* used by snapshots to track fs */ 263 int32_t fs_ncg; /* number of cylinder groups */ 323 int32_t fs_old_cpc; /* cyl per cycle in postbl */ 264 int32_t fs_bsize; /* size of basic blocks in fs */ 324 int32_t fs_maxbsize; /* maximum blocking factor permitted * 265 int32_t fs_fsize; /* size of frag blocks in fs */ / 266 int32_t fs_frag; /* number of frags in a block in fs */ 325 int64_t fs_sparecon64[17]; /* old rotation block list head */ 267 /* these are configuration parameters */ 326 int64_t fs_sblockloc; /* byte offset of standard superblock 268 int32_t fs_minfree; /* minimum percentage of free blocks * */ / 327 struct csum_total fs_cstotal; /* cylinder summary information */ 269 int32_t fs_old_rotdelay; /* num of ms for optimal next block */ 328 ufs_time_t fs_time; /* last time written */ 270 int32_t fs_old_rps; /* disk revolutions per second */ 329 int64_t fs_size; /* number of blocks in fs */ 271 /* these fields can be computed from the others */ 330 int64_t fs_dsize; /* number of data blocks in fs */ 272 int32_t fs_bmask; /* ‘‘blkoff’’ calc of blk offsets */ 331 ufs2_daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ 273 int32_t fs_fmask; /* ‘‘fragoff’’ calc of frag offsets */ 332 int64_t fs_pendingblocks; /* blocks in process of being freed */ 274 int32_t fs_bshift; /* ‘‘lblkno’’ calc of logical blkno */ 333 int32_t fs_pendinginodes; /* inodes in process of being freed */ 275 int32_t fs_fshift; /* ‘‘numfrags’’ calc number of frags * 334 int32_t fs_snapinum[FSMAXSNAP];/* list of snapshot inode numbers */ / 335 int32_t fs_avgfilesize; /* expected average file size */ 276 /* these are configuration parameters */ 336 int32_t fs_avgfpdir; /* expected # of files per directory * 277 int32_t fs_maxcontig; /* max number of contiguous blks */ / 278 int32_t fs_maxbpg; /* max number of blks per cyl group */ 337 int32_t fs_save_cgsize; /* save real cg size to use fs_bsize * 279 /* these fields can be computed from the others */ / 280 int32_t fs_fragshift; /* block to frag shift */ 338 int32_t fs_sparecon32[26]; /* reserved for future constants */ 281 int32_t fs_fsbtodb; /* fsbtodb and dbtofsb shift constant 339 int32_t fs_flags; /* see FS_ flags below */ */ 340 int32_t fs_contigsumsize; /* size of cluster summary array */ 282 int32_t fs_sbsize; /* actual size of super block */ 341 int32_t fs_maxsymlinklen; /* max length of an internal symlink * 283 int32_t fs_spare1[2]; /* old fs_csmask */ / 284 /* old fs_csshift */ 342 int32_t fs_old_inodefmt; /* format of on-disk inodes */ 285 int32_t fs_nindir; /* value of NINDIR */ 343 u_int64_t fs_maxfilesize; /* maximum representable file size */ 286 int32_t fs_inopb; /* value of INOPB */ 344 int64_t fs_qbmask; /* ˜fs_bmask for use with 64-bit size 287 int32_t fs_old_nspf; /* value of NSPF */ */ 288 /* yet another configuration parameter */ 345 int64_t fs_qfmask; /* 
˜fs_fmask for use with 64-bit size 289 int32_t fs_optim; /* optimization preference, see below */ */ 346 int32_t fs_state; /* validate fs_clean field */ 290 int32_t fs_old_npsect; /* # sectors/track including spares */ 347 int32_t fs_old_postblformat; /* format of positional layout tables 291 int32_t fs_old_interleave; /* hardware sector interleave */ */ 292 int32_t fs_old_trackskew; /* sector 0 skew, per track */ 348 int32_t fs_old_nrpos; /* number of rotational positions */ 293 int32_t fs_id[2]; /* unique filesystem id */ 349 int32_t fs_spare5[2]; /* old fs_postbloff */ 294 /* sizes determined by number of cylinder groups and their sizes */ 350 /* old fs_rotbloff */ 295 int32_t fs_old_csaddr; /* blk addr of cyl grp summary area */ 351 int32_t fs_magic; /* magic number */ 296 int32_t fs_cssize; /* size of cyl grp summary area */ 352 }; 297 int32_t fs_cgsize; /* cylinder group size */ 353 298 int32_t fs_spare2; /* old fs_ntrak */ 354 /* Sanity checking. */ 299 int32_t fs_old_nsect; /* sectors per track */ 355 #ifdef CTASSERT 300 int32_t fs_old_spc; /* sectors per cylinder */ 356 CTASSERT(sizeof(struct fs) == 1376); 301 int32_t fs_old_ncyl; /* cylinders in filesystem */ 357 #endif 302 int32_t fs_old_cpg; /* cylinders per group */ 358 303 int32_t fs_ipg; /* inodes per group */ 359 /* 304 int32_t fs_fpg; /* blocks per group * fs_frag */ 360 * Filesystem identification 305 /* this data must be re-computed after crashes */ 361 */ 306 struct csum fs_old_cstotal; /* cylinder summary information */ 362 #define FS_UFS1_MAGIC 0x011954 /* UFS1 fast filesystem magic number * 307 /* these fields are cleared at mount time */ / 308 int8_t fs_fmod; /* super block modified flag */ 363 #define FS_UFS2_MAGIC 0x19540119 /* UFS2 fast filesystem magic number * 309 int8_t fs_clean; /* filesystem is clean flag */ / 310 int8_t fs_ronly; /* mounted read-only flag */ 364 #define FS_BAD2_MAGIC 0x19960408 /* UFS2 incomplete newfs magic number 311 int8_t fs_old_flags; /* old FS_ flags */ */ 312 u_char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ 365 #define FS_OKAY 0x7c269d38 /* superblock checksum */ 313 u_char fs_volname[MAXVOLLEN]; /* volume name */ 366 #define FS_42INODEFMT -1 /* 4.2BSD inode format */ 314 u_int64_t fs_swuid; /* system-wide uid */ 367 #define FS_44INODEFMT 2 /* 4.4BSD inode format */ 315 int32_t fs_pad; /* due to alignment of fs_swuid */ 368 316 /* these fields retain the current block allocation info */ 369 /* 317 int32_t fs_cgrotor; /* last cg searched */ 370 * Preference for optimization. 318 void *fs_ocsp[NOCSPTRS]; /* padding; was list of fs_cs buffers 371 */ 11/15/03 23:08:27 sys/ufs/ffs/fs.h 4 372 #define FS_OPTTIME 0 /* minimize allocation time */ 437 /* 373 #define FS_OPTSPACE 1 /* minimize disk fragmentation */ 438 * Convert cylinder group to base address of its global summary info. 374 439 */ 375 /* 440 #define fs_cs(fs, indx) fs_csp[indx] 376 * Filesystem flags. 441 377 * 442 /* 378 * The FS_UNCLEAN flag is set by the kernel when the filesystem was 443 * Cylinder group block for a filesystem. 379 * mounted with fs_clean set to zero. The FS_DOSOFTDEP flag indicates 444 */ 380 * that the filesystem should be managed by the soft updates code. 445 #define CG_MAGIC 0x090255 381 * Note that the FS_NEEDSFSCK flag is set and cleared only by the 446 struct cg { 382 * fsck utility. 
It is set when background fsck finds an unexpected 447 int32_t cg_firstfield; /* historic cyl groups linked list */ 383 * inconsistency which requires a traditional foreground fsck to be 448 int32_t cg_magic; /* magic number */ 384 * run. Such inconsistencies should only be found after an uncorrectable 449 int32_t cg_old_time; /* time last written */ 385 * disk error. A foreground fsck will clear the FS_NEEDSFSCK flag when 450 int32_t cg_cgx; /* we are the cgx’th cylinder group */ 386 * it has successfully cleaned up the filesystem. The kernel uses this 451 int16_t cg_old_ncyl; /* number of cyl’s this cg */ 387 * flag to enforce that inconsistent filesystems be mounted read-only. 452 int16_t cg_old_niblk; /* number of inode blocks this cg */ 388 * The FS_INDEXDIRS flag when set indicates that the kernel maintains 453 int32_t cg_ndblk; /* number of data blocks this cg */ 389 * on-disk auxiliary indexes (such as B-trees) for speeding directory 454 struct csum cg_cs; /* cylinder summary information */ 390 * accesses. Kernels that do not support auxiliary indicies clear the 455 int32_t cg_rotor; /* position of last used block */ 391 * flag to indicate that the indicies need to be rebuilt (by fsck) before 456 int32_t cg_frotor; /* position of last used frag */ 392 * they can be used. 457 int32_t cg_irotor; /* position of last used inode */ 393 * 458 int32_t cg_frsum[MAXFRAG]; /* counts of available frags */ 394 * FS_ACLS indicates that ACLs are administratively enabled for the 459 int32_t cg_old_btotoff; /* (int32) block totals per cylinder * 395 * file system, so they should be loaded from extended attributes, / 396 * observed for access control purposes, and be administered by object 460 int32_t cg_old_boff; /* (u_int16) free block positions */ 397 * owners. FS_MULTILABEL indicates that the TrustedBSD MAC Framework 461 int32_t cg_iusedoff; /* (u_int8) used inode map */ 398 * should attempt to back MAC labels into extended attributes on the 462 int32_t cg_freeoff; /* (u_int8) free block map */ 399 * file system rather than maintain a single mount label for all 463 int32_t cg_nextfreeoff; /* (u_int8) next available space */ 400 * objects. 464 int32_t cg_clustersumoff; /* (u_int32) counts of avail clusters 401 */ */ 402 #define FS_UNCLEAN 0x01 /* filesystem not clean at mount */ 465 int32_t cg_clusteroff; /* (u_int8) free cluster map */ 403 #define FS_DOSOFTDEP 0x02 /* filesystem using soft dependencies */ 466 int32_t cg_nclusterblks; /* number of clusters this cg */ 404 #define FS_NEEDSFSCK 0x04 /* filesystem needs sync fsck before mount */ 467 int32_t cg_niblk; /* number of inode blocks this cg */ 405 #define FS_INDEXDIRS 0x08 /* kernel supports indexed directories */ 468 int32_t cg_initediblk; /* last initialized inode */ 406 #define FS_ACLS 0x10 /* file system has ACLs enabled */ 469 int32_t cg_sparecon32[3]; /* reserved for future use */ 407 #define FS_MULTILABEL 0x20 /* file system is MAC multi-label */ 470 ufs_time_t cg_time; /* time last written */ 408 #define FS_FLAGS_UPDATED 0x80 /* flags have been moved to new location */ 471 int64_t cg_sparecon64[3]; /* reserved for future use */ 409 472 u_int8_t cg_space[1]; /* space for cylinder group maps */ 410 /* 473 /* actually longer */ 411 * Macros to access bits in the fs_active array. 
474 }; 412 */ 475 413 #define ACTIVECGNUM(fs, cg) ((fs)->fs_active[(cg) / (NBBY * sizeof(int))]) 476 /* 414 #define ACTIVECGOFF(cg) (1 << ((cg) % (NBBY * sizeof(int)))) 477 * Macros for access to cylinder group array structures 415 478 */ 416 /* 479 #define cg_chkmagic(cgp) ((cgp)->cg_magic == CG_MAGIC) 417 * The size of a cylinder group is calculated by CGSIZE. The maximum size 480 #define cg_inosused(cgp) \ 418 * is limited by the fact that cylinder groups are at most one block. 481 ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_iusedoff)) 419 * Its size is derived from the size of the maps maintained in the 482 #define cg_blksfree(cgp) \ 420 * cylinder group and the (struct cg) size. 483 ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_freeoff)) 421 */ 484 #define cg_clustersfree(cgp) \ 422 #define CGSIZE(fs) \ 485 ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_clusteroff)) 423 /* base cg */ (sizeof(struct cg) + sizeof(int32_t) + \ 486 #define cg_clustersum(cgp) \ 424 /* old btotoff */ (fs)->fs_old_cpg * sizeof(int32_t) + \ 487 ((int32_t *)((u_int8_t *)(cgp) + (cgp)->cg_clustersumoff)) 425 /* old boff */ (fs)->fs_old_cpg * sizeof(u_int16_t) + \ 488 426 /* inode map */ howmany((fs)->fs_ipg, NBBY) + \ 489 /* 427 /* block map */ howmany((fs)->fs_fpg, NBBY) +\ 490 * Turn filesystem block numbers into disk block addresses. 428 /* if present */ ((fs)->fs_contigsumsize <= 0 ? 0 : \ 491 * This maps filesystem blocks to device size blocks. 429 /* cluster sum */ (fs)->fs_contigsumsize * sizeof(int32_t) + \ 492 */ 430 /* cluster map */ howmany(fragstoblks(fs, (fs)->fs_fpg), NBBY))) 493 #define fsbtodb(fs, b) ((b) << (fs)->fs_fsbtodb) 431 494 #define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb) 432 /* 495 433 * The minimal number of cylinder groups that should be created. 496 /* 434 */ 497 * Cylinder group macros to locate things in cylinder groups. 435 #define MINCYLGRPS 4 498 * They calc filesystem addresses of cylinder group data structures. 436 499 */ 11/15/03 23:08:27 sys/ufs/ffs/fs.h 5 500 #define cgbase(fs, c) (((ufs2_daddr_t)(fs)->fs_fpg) * (c)) 560 ((frags) >> (fs)->fs_fragshift) 501 #define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */ 561 #define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \ 502 #define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk * 562 ((blks) << (fs)->fs_fragshift) / 563 #define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \ 503 #define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk * 564 ((fsb) & ((fs)->fs_frag - 1)) / 565 #define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \ 504 #define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */ 566 ((fsb) &˜ ((fs)->fs_frag - 1)) 505 #define cgstart(fs, c) \ 567 506 ((fs)->fs_magic == FS_UFS2_MAGIC ? cgbase(fs, c) : \ 568 /* 507 (cgbase(fs, c) + (fs)->fs_old_cgoffset * ((c) & ˜((fs)->fs_old_cgmask)) 569 * Determine the number of available frags given a )) 570 * percentage to hold in reserve. 508 571 */ 509 /* 572 #define freespace(fs, percentreserved) \ 510 * Macros for handling inode numbers: 573 (blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \ 511 * inode number to filesystem block offset. 574 (fs)->fs_cstotal.cs_nffree - \ 512 * inode number to cylinder group number. 575 (((off_t)((fs)->fs_dsize)) * (percentreserved) / 100)) 513 * inode number to filesystem block address. 576 514 */ 577 /* 515 #define ino_to_cg(fs, x) ((x) / (fs)->fs_ipg) 578 * Determining the size of a file block in the filesystem. 
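The cg_inosused()/cg_blksfree()/cg_clustersum() macros above locate the variable-size maps that live in cg_space[] after the fixed portion of struct cg, using byte offsets (cg_iusedoff, cg_freeoff, and so on) stored in the header itself; that is what lets one on-disk layout serve cylinder groups with different map sizes. A cut-down header with two such maps shows the pattern; the structure and sizes here are invented for the demonstration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cghdr {
	int32_t	magic;
	int32_t	iusedoff;	/* offset of the used-inode map */
	int32_t	freeoff;	/* offset of the free-fragment map */
	uint8_t	space[1];	/* actually longer */
};

#define cg_inosused(cgp) ((uint8_t *)(cgp) + (cgp)->iusedoff)
#define cg_blksfree(cgp) ((uint8_t *)(cgp) + (cgp)->freeoff)

int
main(void)
{
	int inomapsz = 16, fragmapsz = 64;
	struct cghdr *cgp = calloc(1, sizeof(*cgp) + inomapsz + fragmapsz);

	if (cgp == NULL)
		return (1);
	/* Lay the maps out back to back and record where they landed. */
	cgp->iusedoff = sizeof(*cgp);
	cgp->freeoff = cgp->iusedoff + inomapsz;
	memset(cg_blksfree(cgp), 0xff, fragmapsz);	/* every fragment free */

	printf("inode map at +%d, fragment map at +%d\n",
	    (int)cgp->iusedoff, (int)cgp->freeoff);
	free(cgp);
	return (0);
}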
516 #define ino_to_fsba(fs, x) \ 579 */ 517 ((ufs2_daddr_t)(cgimin(fs, ino_to_cg(fs, x)) + \ 580 #define blksize(fs, ip, lbn) \ 518 (blkstofrags((fs), (((x) % (fs)->fs_ipg) / INOPB(fs)))))) 581 (((lbn) >= NDADDR || (ip)->i_size >= smalllblktosize(fs, (lbn) + 1)) \ 519 #define ino_to_fsbo(fs, x) ((x) % INOPB(fs)) 582 ? (fs)->fs_bsize \ 520 583 : (fragroundup(fs, blkoff(fs, (ip)->i_size)))) 521 /* 584 #define sblksize(fs, size, lbn) \ 522 * Give cylinder group number for a filesystem block. 585 (((lbn) >= NDADDR || (size) >= ((lbn) + 1) << (fs)->fs_bshift) \ 523 * Give cylinder group block number for a filesystem block. 586 ? (fs)->fs_bsize \ 524 */ 587 : (fragroundup(fs, blkoff(fs, (size))))) 525 #define dtog(fs, d) ((d) / (fs)->fs_fpg) 588 526 #define dtogd(fs, d) ((d) % (fs)->fs_fpg) 589 527 590 /* 528 /* 591 * Number of inodes in a secondary storage block/fragment. 529 * Extract the bits for a block from a map. 592 */ 530 * Compute the cylinder and rotational position of a cyl block addr. 593 #define INOPB(fs) ((fs)->fs_inopb) 531 */ 594 #define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift) 532 #define blkmap(fs, map, loc) \ 595 533 (((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag) 596 /* )) 597 * Number of indirects in a filesystem block. 534 598 */ 535 /* 599 #define NINDIR(fs) ((fs)->fs_nindir) 536 * The following macros optimize certain frequently calculated 600 537 * quantities by using shifts and masks in place of divisions 601 extern int inside[], around[]; 538 * modulos and multiplications. 602 extern u_char *fragtbl[]; 539 */ 603 540 #define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \ 604 #endif 541 ((loc) & (fs)->fs_qbmask) 542 #define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \ 543 ((loc) & (fs)->fs_qfmask) 544 #define lfragtosize(fs, frag) /* calculates ((off_t)frag * fs->fs_fsize) */ \ 545 (((off_t)(frag)) << (fs)->fs_fshift) 546 #define lblktosize(fs, blk) /* calculates ((off_t)blk * fs->fs_bsize) */ \ 547 (((off_t)(blk)) << (fs)->fs_bshift) 548 /* Use this only when ‘blk’ is known to be small, e.g., < NDADDR. */ 549 #define smalllblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \ 550 ((blk) << (fs)->fs_bshift) 551 #define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \ 552 ((loc) >> (fs)->fs_bshift) 553 #define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ 554 ((loc) >> (fs)->fs_fshift) 555 #define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \ 556 (((size) + (fs)->fs_qbmask) & (fs)->fs_bmask) 557 #define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \ 558 (((size) + (fs)->fs_qfmask) & (fs)->fs_fmask) 559 #define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \ 11/09/03 17:37:40 sys/vm/vm_glue.c 1 1 /* 65 2 * Copyright (c) 1991, 1993 583 /* 3 * The Regents of the University of California. All rights reserved. 584 * Implement fork’s actions on an address space. 4 * 585 * Here we arrange for the address space to be copied or referenced, 5 * This code is derived from software contributed to Berkeley by 586 * allocate a user struct (pcb and kernel stack), then call the 6 * The Mach project at Carnegie-Mellon University. 587 * machine-dependent layer to fill those in and make the new process 7 * 588 * ready to run. The new process is set up so that it returns directly 8 * Redistribution and use in source and binary forms, with or without 589 * to user mode to avoid stack copying and relocation problems. 
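/*
 * Illustrative sketch, not part of fs.h: the shift/mask macros above
 * (blkoff, fragoff, lblkno, numfrags, blkroundup, fragroundup) are only
 * valid because block and fragment sizes are powers of two.  This
 * standalone check uses a hypothetical 16K/2K geometry to confirm the
 * shift/mask forms agree with the divide/modulo forms they replace.
 */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t bsize = 16384, fsize = 2048;	/* hypothetical geometry */
	const uint64_t bmask = ~(bsize - 1), qbmask = bsize - 1;
	const uint64_t fmask = ~(fsize - 1), qfmask = fsize - 1;
	const int bshift = 14, fshift = 11;

	for (uint64_t loc = 0; loc < 4 * bsize; loc += 977) {
		assert((loc & qbmask) == loc % bsize);		/* blkoff()      */
		assert((loc & qfmask) == loc % fsize);		/* fragoff()     */
		assert((loc >> bshift) == loc / bsize);		/* lblkno()      */
		assert((loc >> fshift) == loc / fsize);		/* numfrags()    */
		assert(((loc + qbmask) & bmask) ==		/* blkroundup()  */
		    ((loc + bsize - 1) / bsize) * bsize);
		assert(((loc + qfmask) & fmask) ==		/* fragroundup() */
		    ((loc + fsize - 1) / fsize) * fsize);
	}
	printf("shift/mask forms match the divide/modulo forms\n");
	return (0);
}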
9 * modification, are permitted provided that the following conditions 590 */ 10 * are met: 591 void 11 * 1. Redistributions of source code must retain the above copyright 592 vm_forkproc(td, p2, td2, flags) 12 * notice, this list of conditions and the following disclaimer. 593 struct thread *td; 13 * 2. Redistributions in binary form must reproduce the above copyright 594 struct proc *p2; 14 * notice, this list of conditions and the following disclaimer in the 595 struct thread *td2; 15 * documentation and/or other materials provided with the distribution. 596 int flags; 16 * 3. All advertising materials mentioning features or use of this software 597 { 17 * must display the following acknowledgement: 598 struct proc *p1 = td->td_proc; 18 * This product includes software developed by the University of 599 struct user *up; 19 * California, Berkeley and its contributors. 600 20 * 4. Neither the name of the University nor the names of its contributors 601 GIANT_REQUIRED; 21 * may be used to endorse or promote products derived from this software 602 22 * without specific prior written permission. 603 if ((flags & RFPROC) == 0) { 23 * 604 /* 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 605 * Divorce the memory, if it is shared, essentially 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 606 * this changes shared memory amongst threads, into 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 607 * COW locally. 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 608 */ 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 609 if ((flags & RFMEM) == 0) { 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 610 if (p1->p_vmspace->vm_refcnt > 1) { 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 611 vmspace_unshare(p1); 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 612 } 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 613 } 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 614 cpu_fork(td, p2, td2, flags); 34 * SUCH DAMAGE. 615 return; 35 * 616 } 36 * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 617 37 * 618 if (flags & RFMEM) { 38 * 619 p2->p_vmspace = p1->p_vmspace; 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 620 p1->p_vmspace->vm_refcnt++; 40 * All rights reserved. 621 } 41 * 622 42 * Permission to use, copy, modify and distribute this software and 623 while (vm_page_count_severe()) { 43 * its documentation is hereby granted, provided that both the copyright 624 VM_WAIT; 44 * notice and this permission notice appear in all copies of the 625 } 45 * software, derivative works or modified versions, and any portions 626 46 * thereof, and that both notices appear in supporting documentation. 627 if ((flags & RFMEM) == 0) { 47 * 628 p2->p_vmspace = vmspace_fork(p1->p_vmspace); 48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 629 49 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 630 pmap_pinit2(vmspace_pmap(p2->p_vmspace)); 50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
631 51 * 632 if (p1->p_vmspace->vm_shm) 52 * Carnegie Mellon requests users of this software to return to 633 shmfork(p1, p2); 53 * 634 } 54 * Software Distribution Coordinator or [email protected] 635 55 * School of 636 /* XXXKSE this is unsatisfactory but should be adequate */ 56 * Carnegie Mellon University 637 up = p2->p_uarea; 57 * Pittsburgh PA 15213-3890 638 MPASS(p2->p_sigacts != NULL); 58 * 639 59 * any improvements or extensions that they make and grant Carnegie the 640 /* 60 * rights to redistribute these changes. 641 * p_stats currently points at fields in the user struct 61 */ 642 * but not at &u, instead at p_addr. Copy parts of 62 643 * p_stats; zero the rest of p_stats (statistics). 63 #include 644 */ 64 __FBSDID("$FreeBSD: src/sys/vm/vm_glue.c,v 1.187 2003/11/10 01:37:40 alc Exp $ 645 p2->p_stats = &up->u_stats; "); 646 bzero(&up->u_stats.pstat_startzero, 11/09/03 17:37:40 sys/vm/vm_glue.c 2 647 (unsigned) ((caddr_t) &up->u_stats.pstat_endzero - 648 (caddr_t) &up->u_stats.pstat_startzero)); 649 bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, 650 ((caddr_t) &up->u_stats.pstat_endcopy - 651 (caddr_t) &up->u_stats.pstat_startcopy)); 652 653 /* 654 * cpu_fork will copy and update the pcb, set up the kernel stack, 655 * and make the child ready to run. 656 */ 657 cpu_fork(td, p2, td2, flags); 658 } 659 660 /* 661 * Called after process has been wait(2)’ed apon and is being reaped. 662 * The idea is to reclaim resources that we could not reclaim while 663 * the process was still executing. 664 */ 665 void 666 vm_waitproc(p) 667 struct proc *p; 668 { 669 670 GIANT_REQUIRED; 671 vmspace_exitfree(p); /* and clean-out the vmspace */ 672 } 11/19/03 10:48:45 sys/vm/vm_map.c 1 1 /* 66 * Virtual memory mapping module. 2 * Copyright (c) 1991, 1993 67 */ 3 * The Regents of the University of California. All rights reserved. 68 4 * 69 #include 5 * This code is derived from software contributed to Berkeley by 70 __FBSDID("$FreeBSD: src/sys/vm/vm_map.c,v 1.323 2003/11/19 18:48:45 alc Exp $" 6 * The Mach Operating System project at Carnegie-Mellon University. ); 7 * 71 8 * Redistribution and use in source and binary forms, with or without 72 #include 9 * modification, are permitted provided that the following conditions 73 #include 10 * are met: 74 #include 11 * 1. Redistributions of source code must retain the above copyright 75 #include 12 * notice, this list of conditions and the following disclaimer. 76 #include 13 * 2. Redistributions in binary form must reproduce the above copyright 77 #include 14 * notice, this list of conditions and the following disclaimer in the 78 #include 15 * documentation and/or other materials provided with the distribution. 79 #include 16 * 3. All advertising materials mentioning features or use of this software 80 #include 17 * must display the following acknowledgement: 81 #include 18 * This product includes software developed by the University of 82 #include 19 * California, Berkeley and its contributors. 83 #include 20 * 4. Neither the name of the University nor the names of its contributors 84 #include 21 * may be used to endorse or promote products derived from this software 85 22 * without specific prior written permission. 86 #include 23 * 87 #include 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ‘‘AS IS’’ AND 88 #include 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 89 #include 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 90 #include 27 * ARE DISCLAIMED. 
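/*
 * Illustrative sketch, not part of vm_glue.c: the bzero()/bcopy() calls in
 * vm_forkproc() rely on marker fields (pstat_startzero, pstat_startcopy,
 * ...) that delimit regions of a structure.  struct toy_stats and
 * stats_fork() are invented to show the same idiom in isolation.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>

struct toy_stats {
	/* region zeroed for the child */
	long	st_startzero;		/* first field to zero      */
	long	st_faults;
	long	st_syscalls;
	long	st_endzero;		/* first field NOT zeroed   */
	/* region copied from the parent */
	long	st_startcopy;		/* first field to copy      */
	long	st_prio;
	long	st_nice;
	long	st_endcopy;		/* first field NOT copied   */
};

static void
stats_fork(const struct toy_stats *parent, struct toy_stats *child)
{
	bzero(&child->st_startzero,
	    (char *)&child->st_endzero - (char *)&child->st_startzero);
	bcopy(&parent->st_startcopy, &child->st_startcopy,
	    (char *)&child->st_endcopy - (char *)&child->st_startcopy);
}

int
main(void)
{
	struct toy_stats p = { 0, 10, 20, 0, 0, 4, -5, 0 }, c;

	memset(&c, 0xff, sizeof(c));	/* garbage, as freshly allocated memory */
	stats_fork(&p, &c);
	printf("child: faults %ld syscalls %ld prio %ld nice %ld\n",
	    c.st_faults, c.st_syscalls, c.st_prio, c.st_nice);
	return (0);
}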
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 91 #include 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 92 #include 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 93 #include 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 94 #include 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 95 #include 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 96 #include 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 97 34 * SUCH DAMAGE. 98 /* 35 * 99 * Virtual memory maps provide for the mapping, protection, 36 * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 100 * and sharing of virtual memory objects. In addition, 37 * 101 * this module provides for an efficient virtual copy of 38 * 102 * memory from one map to another. 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 103 * 40 * All rights reserved. 104 * Synchronization is required prior to most operations. 41 * 105 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 106 * Maps consist of an ordered doubly-linked list of simple 43 * 107 * entries; a single hint is used to speed up lookups. 44 * Permission to use, copy, modify and distribute this software and 108 * 45 * its documentation is hereby granted, provided that both the copyright 109 * Since portions of maps are specified by start/end addresses, 46 * notice and this permission notice appear in all copies of the 110 * which may not align with existing map entries, all 47 * software, derivative works or modified versions, and any portions 111 * routines merely "clip" entries to these start/end values. 48 * thereof, and that both notices appear in supporting documentation. 112 * [That is, an entry is split into two, bordering at a 49 * 113 * start or end value.] Note that these clippings may not 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 114 * always be necessary (as the two resulting entries are then 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 115 * not changed); however, the clipping is done for convenience. 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 116 * 53 * 117 * As mentioned above, virtual copy operations are performed 54 * Carnegie Mellon requests users of this software to return to 118 * by copying VM object references from one map to 55 * 119 * another, and then marking both regions as copy-on-write. 56 * Software Distribution Coordinator or [email protected] 120 */ 57 * School of Computer Science 121 58 * Carnegie Mellon University 122 /* 59 * Pittsburgh PA 15213-3890 123 * vm_map_startup: 60 * 124 * 61 * any improvements or extensions that they make and grant Carnegie the 125 * Initialize the vm_map module. Must be called before 62 * rights to redistribute these changes. 126 * any other vm_map routines. 63 */ 127 * 64 128 * Map and entry structures are allocated from the general 65 /* 129 * purpose memory pool with some exceptions: 11/19/03 10:48:45 sys/vm/vm_map.c 2 130 * 195 131 * - The kernel map and kmem submap are allocated statically. 196 static void 132 * - Kernel map entries are allocated out of a static pool. 197 vm_map_zfini(void *mem, int size) 133 * 198 { 134 * These restrictions are necessary since malloc() uses the 199 vm_map_t map; 135 * maps and requires map entries. 
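/*
 * Illustrative sketch, not part of vm_map.c: the "clip" operation described
 * in the comment above splits one [start, end) entry into two at a boundary
 * address.  struct toy_entry and toy_clip() are invented; objects, offsets
 * and locking are left out so only the list surgery remains.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_entry {
	struct toy_entry *prev, *next;
	unsigned long start, end;
};

static void
toy_clip(struct toy_entry *entry, unsigned long addr)
{
	struct toy_entry *new;

	if (addr <= entry->start || addr >= entry->end)
		return;				/* nothing to split */
	if ((new = malloc(sizeof(*new))) == NULL)
		abort();
	new->start = addr;
	new->end = entry->end;
	entry->end = addr;
	/* link the new piece immediately after the old one */
	new->prev = entry;
	new->next = entry->next;
	entry->next->prev = new;
	entry->next = new;
}

int
main(void)
{
	struct toy_entry header, e;

	header.start = header.end = 0;
	header.prev = header.next = &header;	/* sentinel, as in struct vm_map */
	e.start = 0x1000; e.end = 0x5000;
	e.prev = e.next = &header;
	header.next = header.prev = &e;

	toy_clip(&e, 0x3000);
	for (struct toy_entry *p = header.next; p != &header; p = p->next)
		printf("[%#lx, %#lx)\n", p->start, p->end);
	return (0);
}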
200 136 */ 201 map = (vm_map_t)mem; 137 202 mtx_destroy(&map->system_mtx); 138 static struct mtx map_sleep_mtx; 203 lockdestroy(&map->lock); 139 static uma_zone_t mapentzone; 204 } 140 static uma_zone_t kmapentzone; 205 141 static uma_zone_t mapzone; 206 static void 142 static uma_zone_t vmspace_zone; 207 vm_map_zinit(void *mem, int size) 143 static struct vm_object kmapentobj; 208 { 144 static void vmspace_zinit(void *mem, int size); 209 vm_map_t map; 145 static void vmspace_zfini(void *mem, int size); 210 146 static void vm_map_zinit(void *mem, int size); 211 map = (vm_map_t)mem; 147 static void vm_map_zfini(void *mem, int size); 212 map->nentries = 0; 148 static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max); 213 map->size = 0; 149 214 map->infork = 0; 150 #ifdef INVARIANTS 215 mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); 151 static void vm_map_zdtor(void *mem, int size, void *arg); 216 lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE); 152 static void vmspace_zdtor(void *mem, int size, void *arg); 217 } 153 #endif 218 154 219 #ifdef INVARIANTS 155 void 220 static void 156 vm_map_startup(void) 221 vmspace_zdtor(void *mem, int size, void *arg) 157 { 222 { 158 mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); 223 struct vmspace *vm; 159 mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL, 224 160 #ifdef INVARIANTS 225 vm = (struct vmspace *)mem; 161 vm_map_zdtor, 226 162 #else 227 vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg); 163 NULL, 228 } 164 #endif 229 static void 165 vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 230 vm_map_zdtor(void *mem, int size, void *arg) 166 uma_prealloc(mapzone, MAX_KMAP); 231 { 167 kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), 232 vm_map_t map; 168 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 233 169 UMA_ZONE_MTXCLASS | UMA_ZONE_VM); 234 map = (vm_map_t)mem; 170 uma_prealloc(kmapentzone, MAX_KMAPENT); 235 KASSERT(map->nentries == 0, 171 mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), 236 ("map %p nentries == %d on free.", 172 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 237 map, map->nentries)); 173 uma_prealloc(mapentzone, MAX_MAPENT); 238 KASSERT(map->size == 0, 174 } 239 ("map %p size == %lu on free.", 175 240 map, (unsigned long)map->size)); 176 static void 241 KASSERT(map->infork == 0, 177 vmspace_zfini(void *mem, int size) 242 ("map %p infork == %d on free.", 178 { 243 map, map->infork)); 179 struct vmspace *vm; 244 } 180 245 #endif /* INVARIANTS */ 181 vm = (struct vmspace *)mem; 246 182 247 /* 183 vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map)); 248 * Allocate a vmspace structure, including a vm_map and pmap, 184 } 249 * and initialize those structures. The refcnt is set to 1. 185 250 * The remaining fields must be initialized by the caller. 
186 static void 251 */ 187 vmspace_zinit(void *mem, int size) 252 struct vmspace * 188 { 253 vmspace_alloc(min, max) 189 struct vmspace *vm; 254 vm_offset_t min, max; 190 255 { 191 vm = (struct vmspace *)mem; 256 struct vmspace *vm; 192 257 193 vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map)); 258 vm = uma_zalloc(vmspace_zone, M_WAITOK); 194 } 259 CTR1(KTR_VM, "vmspace_alloc: %p", vm); 11/19/03 10:48:45 sys/vm/vm_map.c 3 260 _vm_map_init(&vm->vm_map, min, max); 325 struct vmspace *vm; 261 pmap_pinit(vmspace_pmap(vm)); 326 262 vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */ 327 GIANT_REQUIRED; 263 vm->vm_refcnt = 1; 328 vm = p->p_vmspace; 264 vm->vm_shm = NULL; 329 p->p_vmspace = NULL; 265 vm->vm_exitingcnt = 0; 330 266 return (vm); 331 /* 267 } 332 * cleanup by parent process wait()ing on exiting child. vm_refcnt 268 333 * may not be 0 (e.g. fork() and child exits without exec()ing). 269 void 334 * exitingcnt may increment above 0 and drop back down to zero 270 vm_init2(void) 335 * several times while vm_refcnt is held non-zero. vm_refcnt 271 { 336 * may also increment above 0 and drop back down to zero several 272 uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count, 337 * times while vm_exitingcnt is held non-zero. 273 (VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8 + 338 * 274 maxproc * 2 + maxfiles); 339 * The last wait on the exiting child’s vmspace will clean up 275 vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, 340 * the remainder of the vmspace. 276 #ifdef INVARIANTS 341 */ 277 vmspace_zdtor, 342 if (--vm->vm_exitingcnt == 0 && vm->vm_refcnt == 0) 278 #else 343 vmspace_dofree(vm); 279 NULL, 344 } 280 #endif 345 281 vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 346 void 282 pmap_init2(); 347 _vm_map_lock(vm_map_t map, const char *file, int line) 283 } 348 { 284 349 int error; 285 static __inline void 350 286 vmspace_dofree(struct vmspace *vm) 351 if (map->system_map) 287 { 352 _mtx_lock_flags(&map->system_mtx, 0, file, line); 288 CTR1(KTR_VM, "vmspace_free: %p", vm); 353 else { 289 354 error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread); 290 /* 355 KASSERT(error == 0, ("%s: failed to get lock", __func__)); 291 * Make sure any SysV shm is freed, it might not have been in 356 } 292 * exit1(). 357 map->timestamp++; 293 */ 358 } 294 shmexit(vm); 359 295 360 void 296 /* 361 _vm_map_unlock(vm_map_t map, const char *file, int line) 297 * Lock the map, to wait out all other references to it. 362 { 298 * Delete all of the mappings and pages they hold, then call 363 299 * the pmap module to reclaim anything left. 
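/*
 * Illustrative sketch, not part of vm_map.c: the comments above describe
 * two counters (vm_refcnt and vm_exitingcnt) that must both reach zero
 * before the vmspace is reclaimed, whichever one drops last.  struct
 * toy_space and the helpers are invented; the real code runs under Giant,
 * so no locking is shown here.
 */
#include <stdio.h>

struct toy_space {
	int refcnt;		/* plain references (vm_refcnt)         */
	int exitingcnt;		/* exiting-but-unreaped (vm_exitingcnt) */
};

static void
toy_dofree(struct toy_space *ts)
{
	printf("space %p reclaimed\n", (void *)ts);
}

/* Mirrors vmspace_free(): drop an ordinary reference. */
static void
toy_free(struct toy_space *ts)
{
	if (--ts->refcnt == 0 && ts->exitingcnt == 0)
		toy_dofree(ts);
}

/* Mirrors vmspace_exitfree(): the wait(2) side drops its count. */
static void
toy_exitfree(struct toy_space *ts)
{
	if (--ts->exitingcnt == 0 && ts->refcnt == 0)
		toy_dofree(ts);
}

int
main(void)
{
	struct toy_space ts = { .refcnt = 1, .exitingcnt = 1 };

	toy_free(&ts);		/* refcnt -> 0, but still exiting: no free   */
	toy_exitfree(&ts);	/* both counters zero: reclaimed here        */
	return (0);
}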
364 if (map->system_map) 300 */ 365 _mtx_unlock_flags(&map->system_mtx, 0, file, line); 301 vm_map_lock(&vm->vm_map); 366 else 302 (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, 367 lockmgr(&map->lock, LK_RELEASE, NULL, curthread); 303 vm->vm_map.max_offset); 368 } 304 vm_map_unlock(&vm->vm_map); 369 305 370 void 306 pmap_release(vmspace_pmap(vm)); 371 _vm_map_lock_read(vm_map_t map, const char *file, int line) 307 uma_zfree(vmspace_zone, vm); 372 { 308 } 373 int error; 309 374 310 void 375 if (map->system_map) 311 vmspace_free(struct vmspace *vm) 376 _mtx_lock_flags(&map->system_mtx, 0, file, line); 312 { 377 else { 313 GIANT_REQUIRED; 378 error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread); 314 379 KASSERT(error == 0, ("%s: failed to get lock", __func__)); 315 if (vm->vm_refcnt == 0) 380 } 316 panic("vmspace_free: attempt to free already freed vmspace"); 381 } 317 382 318 if (--vm->vm_refcnt == 0 && vm->vm_exitingcnt == 0) 383 void 319 vmspace_dofree(vm); 384 _vm_map_unlock_read(vm_map_t map, const char *file, int line) 320 } 385 { 321 386 322 void 387 if (map->system_map) 323 vmspace_exitfree(struct proc *p) 388 _mtx_unlock_flags(&map->system_mtx, 0, file, line); 324 { 389 else 11/19/03 10:48:45 sys/vm/vm_map.c 4 390 lockmgr(&map->lock, LK_RELEASE, NULL, curthread); 455 } 391 } 456 392 457 /* 393 int 458 * vm_map_wakeup: 394 _vm_map_trylock(vm_map_t map, const char *file, int line) 459 */ 395 { 460 void 396 int error; 461 vm_map_wakeup(vm_map_t map) 397 462 { 398 error = map->system_map ? 463 399 !_mtx_trylock(&map->system_mtx, 0, file, line) : 464 /* 400 lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, NULL, curthread); 465 * Acquire and release map_sleep_mtx to prevent a wakeup() 401 if (error == 0) 466 * from being performed (and lost) between the vm_map_unlock() 402 map->timestamp++; 467 * and the msleep() in vm_map_unlock_and_wait(). 403 return (error == 0); 468 */ 404 } 469 mtx_lock(&map_sleep_mtx); 405 470 mtx_unlock(&map_sleep_mtx); 406 int 471 wakeup(&map->root); 407 _vm_map_trylock_read(vm_map_t map, const char *file, int line) 472 } 408 { 473 409 int error; 474 long 410 475 vmspace_resident_count(struct vmspace *vmspace) 411 error = map->system_map ? 476 { 412 !_mtx_trylock(&map->system_mtx, 0, file, line) : 477 return pmap_resident_count(vmspace_pmap(vmspace)); 413 lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, NULL, curthread); 478 } 414 return (error == 0); 479 415 } 480 long 416 481 vmspace_wired_count(struct vmspace *vmspace) 417 int 482 { 418 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) 483 return pmap_wired_count(vmspace_pmap(vmspace)); 419 { 484 } 420 485 421 if (map->system_map) { 486 /* 422 #ifdef INVARIANTS 487 * vm_map_create: 423 _mtx_assert(&map->system_mtx, MA_OWNED, file, line); 488 * 424 #endif 489 * Creates and returns a new empty VM map with 425 } else 490 * the given physical map structure, and having 426 KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE, 491 * the given lower and upper address bounds. 
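/*
 * Illustrative sketch, not part of vm_map.c: vm_map_unlock_and_wait() and
 * vm_map_wakeup() use map_sleep_mtx as an interlock so a wakeup() cannot
 * slip in between dropping the map lock and going to sleep.  This userland
 * analogue uses pthread mutexes and a condition variable instead of
 * msleep()/wakeup(); all names here are invented.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;	/* "map lock" */
static pthread_mutex_t sleep_mtx = PTHREAD_MUTEX_INITIALIZER;	/* interlock  */
static pthread_cond_t  sleep_cv  = PTHREAD_COND_INITIALIZER;
static int done;

/* Analogue of vm_map_unlock_and_wait(): take the interlock, then drop the map. */
static void
toy_unlock_and_wait(void)
{
	pthread_mutex_lock(&sleep_mtx);	/* taken before the map lock is dropped */
	pthread_mutex_unlock(&map_lock);
	while (!done)
		pthread_cond_wait(&sleep_cv, &sleep_mtx);
	pthread_mutex_unlock(&sleep_mtx);
}

/* Analogue of vm_map_wakeup(): the interlock prevents a lost wakeup. */
static void
toy_wakeup(void)
{
	pthread_mutex_lock(&sleep_mtx);
	done = 1;
	pthread_cond_broadcast(&sleep_cv);
	pthread_mutex_unlock(&sleep_mtx);
}

static void *
waiter(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&map_lock);
	toy_unlock_and_wait();
	printf("waiter: woken up\n");
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	usleep(100 * 1000);		/* let the waiter block first */
	toy_wakeup();
	pthread_join(t, NULL);
	return (0);
}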
427 ("%s: lock not held", __func__)); 492 */ 428 map->timestamp++; 493 vm_map_t 429 return (0); 494 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max) 430 } 495 { 431 496 vm_map_t result; 432 void 497 433 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) 498 result = uma_zalloc(mapzone, M_WAITOK); 434 { 499 CTR1(KTR_VM, "vm_map_create: %p", result); 435 500 _vm_map_init(result, min, max); 436 if (map->system_map) { 501 result->pmap = pmap; 437 #ifdef INVARIANTS 502 return (result); 438 _mtx_assert(&map->system_mtx, MA_OWNED, file, line); 503 } 439 #endif 504 440 } else 505 /* 441 KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE, 506 * Initialize an existing vm_map structure 442 ("%s: lock not held", __func__)); 507 * such as that in the vmspace structure. 443 } 508 * The pmap is set elsewhere. 444 509 */ 445 /* 510 static void 446 * vm_map_unlock_and_wait: 511 _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max) 447 */ 512 { 448 int 513 449 vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait) 514 map->header.next = map->header.prev = &map->header; 450 { 515 map->needs_wakeup = FALSE; 451 516 map->system_map = 0; 452 mtx_lock(&map_sleep_mtx); 517 map->min_offset = min; 453 vm_map_unlock(map); 518 map->max_offset = max; 454 return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", 0)); 519 map->first_free = &map->header; 11/19/03 10:48:45 sys/vm/vm_map.c 5 520 map->root = NULL; 585 vm_map_entry_splay(vm_offset_t address, vm_map_entry_t root) 521 map->timestamp = 0; 586 { 522 } 587 struct vm_map_entry dummy; 523 588 vm_map_entry_t lefttreemax, righttreemin, y; 524 void 589 525 vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max) 590 if (root == NULL) 526 { 591 return (root); 527 _vm_map_init(map, min, max); 592 lefttreemax = righttreemin = &dummy; 528 mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); 593 for (;; root = y) { 529 lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE); 594 if (address < root->start) { 530 } 595 if ((y = root->left) == NULL) 531 596 break; 532 /* 597 if (address < y->start) { 533 * vm_map_entry_dispose: [ internal use only ] 598 /* Rotate right. */ 534 * 599 root->left = y->right; 535 * Inverse of vm_map_entry_create. 600 y->right = root; 536 */ 601 root = y; 537 static void 602 if ((y = root->left) == NULL) 538 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) 603 break; 539 { 604 } 540 uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); 605 /* Link into the new root’s right tree. */ 541 } 606 righttreemin->left = root; 542 607 righttreemin = root; 543 /* 608 } else if (address >= root->end) { 544 * vm_map_entry_create: [ internal use only ] 609 if ((y = root->right) == NULL) 545 * 610 break; 546 * Allocates a VM map entry for insertion. 611 if (address >= y->end) { 547 * No entry fields are filled in. 612 /* Rotate left. */ 548 */ 613 root->right = y->left; 549 static vm_map_entry_t 614 y->left = root; 550 vm_map_entry_create(vm_map_t map) 615 root = y; 551 { 616 if ((y = root->right) == NULL) 552 vm_map_entry_t new_entry; 617 break; 553 618 } 554 if (map->system_map) 619 /* Link into the new root’s left tree. */ 555 new_entry = uma_zalloc(kmapentzone, M_NOWAIT); 620 lefttreemax->right = root; 556 else 621 lefttreemax = root; 557 new_entry = uma_zalloc(mapentzone, M_WAITOK); 622 } else 558 if (new_entry == NULL) 623 break; 559 panic("vm_map_entry_create: kernel resources exhausted"); 624 } 560 return (new_entry); 625 /* Assemble the new root. 
*/ 561 } 626 lefttreemax->right = root->left; 562 627 righttreemin->left = root->right; 563 /* 628 root->left = dummy.right; 564 * vm_map_entry_set_behavior: 629 root->right = dummy.left; 565 * 630 return (root); 566 * Set the expected access behavior, either normal, random, or 631 } 567 * sequential. 632 568 */ 633 /* 569 static __inline void 634 * vm_map_entry_{un,}link: 570 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) 635 * 571 { 636 * Insert/remove entries from maps. 572 entry->eflags = (entry->eflags & ˜MAP_ENTRY_BEHAV_MASK) | 637 */ 573 (behavior & MAP_ENTRY_BEHAV_MASK); 638 static void 574 } 639 vm_map_entry_link(vm_map_t map, 575 640 vm_map_entry_t after_where, 576 /* 641 vm_map_entry_t entry) 577 * vm_map_entry_splay: 642 { 578 * 643 579 * Implements Sleator and Tarjan’s top-down splay algorithm. Returns 644 CTR4(KTR_VM, 580 * the vm_map_entry containing the given address. If, however, that 645 "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map, 581 * address is not found in the vm_map, returns a vm_map_entry that is 646 map->nentries, entry, after_where); 582 * adjacent to the address, coming before or after it. 647 map->nentries++; 583 */ 648 entry->prev = after_where; 584 static vm_map_entry_t 649 entry->next = after_where->next; 11/19/03 10:48:45 sys/vm/vm_map.c 6 650 entry->next->prev = entry; 714 651 after_where->next = entry; 715 if (address >= cur->start) { 652 716 *entry = cur; 653 if (after_where != &map->header) { 717 if (cur->end > address) 654 if (after_where != map->root) 718 return (TRUE); 655 vm_map_entry_splay(after_where->start, map->root); 719 } else 656 entry->right = after_where->right; 720 *entry = cur->prev; 657 entry->left = after_where; 721 } 658 after_where->right = NULL; 722 return (FALSE); 659 } else { 723 } 660 entry->right = map->root; 724 661 entry->left = NULL; 725 /* 662 } 726 * vm_map_insert: 663 map->root = entry; 727 * 664 } 728 * Inserts the given whole VM object into the target 665 729 * map at the specified address range. The object’s 666 static void 730 * size should match that of the address range. 667 vm_map_entry_unlink(vm_map_t map, 731 * 668 vm_map_entry_t entry) 732 * Requires that the map be locked, and leaves it so. 669 { 733 * 670 vm_map_entry_t next, prev, root; 734 * If object is non-NULL, ref count must be bumped by caller 671 735 * prior to making call to account for the new entry. 672 if (entry != map->root) 736 */ 673 vm_map_entry_splay(entry->start, map->root); 737 int 674 if (entry->left == NULL) 738 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 675 root = entry->right; 739 vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t ma 676 else { x, 677 root = vm_map_entry_splay(entry->start, entry->left); 740 int cow) 678 root->right = entry->right; 741 { 679 } 742 vm_map_entry_t new_entry; 680 map->root = root; 743 vm_map_entry_t prev_entry; 681 744 vm_map_entry_t temp_entry; 682 prev = entry->prev; 745 vm_eflags_t protoeflags; 683 next = entry->next; 746 684 next->prev = prev; 747 /* 685 prev->next = next; 748 * Check that the start and end points are not bogus. 
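/*
 * Illustrative sketch, not part of vm_map.c: the contract documented for
 * vm_map_lookup_entry() -- return TRUE and the entry containing the address,
 * or FALSE and the entry immediately preceding it (possibly the header).
 * This invented version walks the sorted list directly instead of splaying.
 */
#include <stdio.h>

struct toy_entry {
	struct toy_entry *next;
	unsigned long start, end;
};

static int
toy_lookup_entry(struct toy_entry *header, unsigned long addr,
    struct toy_entry **out)
{
	struct toy_entry *prev = header, *cur;

	for (cur = header->next; cur != header; cur = cur->next) {
		if (addr < cur->start)
			break;
		if (addr < cur->end) {
			*out = cur;
			return (1);		/* contained */
		}
		prev = cur;
	}
	*out = prev;
	return (0);				/* preceding entry only */
}

int
main(void)
{
	struct toy_entry header, a, b, *hit;

	a.start = 0x1000; a.end = 0x2000;
	b.start = 0x4000; b.end = 0x5000;
	header.next = &a; a.next = &b; b.next = &header;
	header.start = header.end = 0;

	printf("0x1800: %d\n", toy_lookup_entry(&header, 0x1800, &hit)); /* 1 */
	printf("0x3000: %d\n", toy_lookup_entry(&header, 0x3000, &hit)); /* 0 */
	return (0);
}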
686 map->nentries--; 749 */ 687 CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map 750 if ((start < map->min_offset) || (end > map->max_offset) || , 751 (start >= end)) 688 map->nentries, entry); 752 return (KERN_INVALID_ADDRESS); 689 } 753 690 754 /* 691 /* 755 * Find the entry prior to the proposed starting address; if it’s part 692 * vm_map_lookup_entry: [ internal use only ] 756 * of an existing entry, this range is bogus. 693 * 757 */ 694 * Finds the map entry containing (or 758 if (vm_map_lookup_entry(map, start, &temp_entry)) 695 * immediately preceding) the specified address 759 return (KERN_NO_SPACE); 696 * in the given map; the entry is returned 760 697 * in the "entry" parameter. The boolean 761 prev_entry = temp_entry; 698 * result indicates whether the address is 762 699 * actually contained in the map. 763 /* 700 */ 764 * Assert that the next entry doesn’t overlap the end point. 701 boolean_t 765 */ 702 vm_map_lookup_entry( 766 if ((prev_entry->next != &map->header) && 703 vm_map_t map, 767 (prev_entry->next->start < end)) 704 vm_offset_t address, 768 return (KERN_NO_SPACE); 705 vm_map_entry_t *entry) /* OUT */ 769 706 { 770 protoeflags = 0; 707 vm_map_entry_t cur; 771 708 772 if (cow & MAP_COPY_ON_WRITE) 709 cur = vm_map_entry_splay(address, map->root); 773 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY; 710 if (cur == NULL) 774 711 *entry = &map->header; 775 if (cow & MAP_NOFAULT) { 712 else { 776 protoeflags |= MAP_ENTRY_NOFAULT; 713 map->root = cur; 777 11/19/03 10:48:45 sys/vm/vm_map.c 7 778 KASSERT(object == NULL, 842 */ 779 ("vm_map_insert: paradoxical MAP_NOFAULT request")); 843 new_entry = vm_map_entry_create(map); 780 } 844 new_entry->start = start; 781 if (cow & MAP_DISABLE_SYNCER) 845 new_entry->end = end; 782 protoeflags |= MAP_ENTRY_NOSYNC; 846 783 if (cow & MAP_DISABLE_COREDUMP) 847 new_entry->eflags = protoeflags; 784 protoeflags |= MAP_ENTRY_NOCOREDUMP; 848 new_entry->object.vm_object = object; 785 849 new_entry->offset = offset; 786 if (object != NULL) { 850 new_entry->avail_ssize = 0; 787 /* 851 788 * OBJ_ONEMAPPING must be cleared unless this mapping 852 new_entry->inheritance = VM_INHERIT_DEFAULT; 789 * is trivially proven to be the only mapping for any 853 new_entry->protection = prot; 790 * of the object’s pages. (Object granularity 854 new_entry->max_protection = max; 791 * reference counting is insufficient to recognize 855 new_entry->wired_count = 0; 792 * aliases with precision.) 856 793 */ 857 /* 794 VM_OBJECT_LOCK(object); 858 * Insert the new entry into the list 795 if (object->ref_count > 1 || object->shadow_count != 0) 859 */ 796 vm_object_clear_flag(object, OBJ_ONEMAPPING); 860 vm_map_entry_link(map, prev_entry, new_entry); 797 VM_OBJECT_UNLOCK(object); 861 map->size += new_entry->end - new_entry->start; 798 } 862 799 else if ((prev_entry != &map->header) && 863 /* 800 (prev_entry->eflags == protoeflags) && 864 * Update the free space hint 801 (prev_entry->end == start) && 865 */ 802 (prev_entry->wired_count == 0) && 866 if ((map->first_free == prev_entry) && 803 ((prev_entry->object.vm_object == NULL) || 867 (prev_entry->end >= new_entry->start)) { 804 vm_object_coalesce(prev_entry->object.vm_object, 868 map->first_free = new_entry; 805 OFF_TO_IDX(prev_entry->offset), 869 } 806 (vm_size_t)(prev_entry->end - prev_entry- 870 >start), 871 #if 0 807 (vm_size_t)(end - prev_entry->end)))) { 872 /* 808 /* 873 * Temporarily removed to avoid MAP_STACK panic, due to 809 * We were able to extend the object. 
Determine if we 874 * MAP_STACK being a huge hack. Will be added back in 810 * can extend the previous map entry to include the 875 * when MAP_STACK (and the user stack mapping) is fixed. 811 * new range as well. 876 */ 812 */ 877 /* 813 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) && 878 * It may be possible to simplify the entry 814 (prev_entry->protection == prot) && 879 */ 815 (prev_entry->max_protection == max)) { 880 vm_map_simplify_entry(map, new_entry); 816 map->size += (end - prev_entry->end); 881 #endif 817 prev_entry->end = end; 882 818 vm_map_simplify_entry(map, prev_entry); 883 if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) { 819 return (KERN_SUCCESS); 884 vm_map_pmap_enter(map, start, 820 } 885 object, OFF_TO_IDX(offset), end - start, 821 886 cow & MAP_PREFAULT_PARTIAL); 822 /* 887 } 823 * If we can extend the object but cannot extend the 888 824 * map entry, we have to create a new map entry. We 889 return (KERN_SUCCESS); 825 * must bump the ref count on the extended object to 890 } 826 * account for it. object may be NULL. 891 827 */ 892 /* 828 object = prev_entry->object.vm_object; 893 * Find sufficient space for ‘length’ bytes in the given map, starting at 829 offset = prev_entry->offset + 894 * ‘start’. The map must be locked. Returns 0 on success, 1 on no space. 830 (prev_entry->end - prev_entry->start); 895 */ 831 vm_object_reference(object); 896 int 832 } 897 vm_map_findspace( 833 898 vm_map_t map, 834 /* 899 vm_offset_t start, 835 * NOTE: if conditionals fail, object can be NULL here. This occurs 900 vm_size_t length, 836 * in things like the buffer map where we manage kva but do not manage 901 vm_offset_t *addr) 837 * backing objects. 902 { 838 */ 903 vm_map_entry_t entry, next; 839 904 vm_offset_t end; 840 /* 905 841 * Create a new entry 906 if (start < map->min_offset) 11/19/03 10:48:45 sys/vm/vm_map.c 8 907 start = map->min_offset; 972 908 if (start > map->max_offset) 973 start = *addr; 909 return (1); 974 910 975 if (map == kmem_map) 911 /* 976 s = splvm(); 912 * Look for the first possible address; if there’s already something 977 913 * at this address, we have to start after it. 978 vm_map_lock(map); 914 */ 979 if (find_space) { 915 if (start == map->min_offset) { 980 if (vm_map_findspace(map, start, length, addr)) { 916 if ((entry = map->first_free) != &map->header) 981 vm_map_unlock(map); 917 start = entry->end; 982 if (map == kmem_map) 918 } else { 983 splx(s); 919 vm_map_entry_t tmp; 984 return (KERN_NO_SPACE); 920 985 } 921 if (vm_map_lookup_entry(map, start, &tmp)) 986 start = *addr; 922 start = tmp->end; 987 } 923 entry = tmp; 988 result = vm_map_insert(map, object, offset, 924 } 989 start, start + length, prot, max, cow); 925 990 vm_map_unlock(map); 926 /* 991 927 * Look through the rest of the map, trying to fit a new region in the 992 if (map == kmem_map) 928 * gap between existing regions, or after the very last region. 993 splx(s); 929 */ 994 930 for (;; start = (entry = next)->end) { 995 return (result); 931 /* 996 } 932 * Find the end of the proposed new region. Be sure we didn’t 997 933 * go beyond the end of the map, or wrap around the address; 998 /* 934 * if so, we lose. Otherwise, if this is the last entry, or 999 * vm_map_simplify_entry: 935 * if the proposed new region fits before the next entry, we 1000 * 936 * win. 1001 * Simplify the given map entry by merging with either neighbor. This 937 */ 1002 * routine also has the ability to merge with both neighbors. 
938 end = start + length; 1003 * 939 if (end > map->max_offset || end < start) 1004 * The map must be locked. 940 return (1); 1005 * 941 next = entry->next; 1006 * This routine guarentees that the passed entry remains valid (though 942 if (next == &map->header || next->start >= end) 1007 * possibly extended). When merging, this routine may delete one or 943 break; 1008 * both neighbors. 944 } 1009 */ 945 *addr = start; 1010 void 946 if (map == kernel_map) { 1011 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) 947 vm_offset_t ksize; 1012 { 948 if ((ksize = round_page(start + length)) > kernel_vm_end) { 1013 vm_map_entry_t next, prev; 949 pmap_growkernel(ksize); 1014 vm_size_t prevsize, esize; 950 } 1015 951 } 1016 if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) 952 return (0); 1017 return; 953 } 1018 954 1019 prev = entry->prev; 955 /* 1020 if (prev != &map->header) { 956 * vm_map_find finds an unallocated region in the target address 1021 prevsize = prev->end - prev->start; 957 * map with the given length. The search is defined to be 1022 if ( (prev->end == entry->start) && 958 * first-fit from the specified address; the region found is 1023 (prev->object.vm_object == entry->object.vm_object) && 959 * returned in the same parameter. 1024 (!prev->object.vm_object || 960 * 1025 (prev->offset + prevsize == entry->offset)) && 961 * If object is non-NULL, ref count must be bumped by caller 1026 (prev->eflags == entry->eflags) && 962 * prior to making call to account for the new entry. 1027 (prev->protection == entry->protection) && 963 */ 1028 (prev->max_protection == entry->max_protection) && 964 int 1029 (prev->inheritance == entry->inheritance) && 965 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 1030 (prev->wired_count == entry->wired_count)) { 966 vm_offset_t *addr, /* IN/OUT */ 1031 if (map->first_free == prev) 967 vm_size_t length, boolean_t find_space, vm_prot_t prot, 1032 map->first_free = entry; 968 vm_prot_t max, int cow) 1033 vm_map_entry_unlink(map, prev); 969 { 1034 entry->start = prev->start; 970 vm_offset_t start; 1035 entry->offset = prev->offset; 971 int result, s = 0; 1036 if (prev->object.vm_object) 11/19/03 10:48:45 sys/vm/vm_map.c 9 1037 vm_object_deallocate(prev->object.vm_object); 1102 object = vm_object_allocate(OBJT_DEFAULT, 1038 vm_map_entry_dispose(map, prev); 1103 atop(entry->end - entry->start)); 1039 } 1104 entry->object.vm_object = object; 1040 } 1105 entry->offset = 0; 1041 1106 } 1042 next = entry->next; 1107 1043 if (next != &map->header) { 1108 new_entry = vm_map_entry_create(map); 1044 esize = entry->end - entry->start; 1109 *new_entry = *entry; 1045 if ((entry->end == next->start) && 1110 1046 (next->object.vm_object == entry->object.vm_object) && 1111 new_entry->end = start; 1047 (!entry->object.vm_object || 1112 entry->offset += (start - entry->start); 1048 (entry->offset + esize == next->offset)) && 1113 entry->start = start; 1049 (next->eflags == entry->eflags) && 1114 1050 (next->protection == entry->protection) && 1115 vm_map_entry_link(map, entry->prev, new_entry); 1051 (next->max_protection == entry->max_protection) && 1116 1052 (next->inheritance == entry->inheritance) && 1117 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { 1053 (next->wired_count == entry->wired_count)) { 1118 vm_object_reference(new_entry->object.vm_object); 1054 if (map->first_free == next) 1119 } 1055 map->first_free = entry; 1120 } 1056 vm_map_entry_unlink(map, next); 1121 1057 entry->end = next->end; 1122 /* 1058 if 
(next->object.vm_object) 1123 * vm_map_clip_end: [ internal use only ] 1059 vm_object_deallocate(next->object.vm_object); 1124 * 1060 vm_map_entry_dispose(map, next); 1125 * Asserts that the given entry ends at or before 1061 } 1126 * the specified address; if necessary, 1062 } 1127 * it splits the entry into two. 1063 } 1128 */ 1064 /* 1129 #define vm_map_clip_end(map, entry, endaddr) \ 1065 * vm_map_clip_start: [ internal use only ] 1130 { \ 1066 * 1131 if ((endaddr) < (entry->end)) \ 1067 * Asserts that the given entry begins at or after 1132 _vm_map_clip_end((map), (entry), (endaddr)); \ 1068 * the specified address; if necessary, 1133 } 1069 * it splits the entry into two. 1134 1070 */ 1135 /* 1071 #define vm_map_clip_start(map, entry, startaddr) \ 1136 * This routine is called only when it is known that 1072 { \ 1137 * the entry must be split. 1073 if (startaddr > entry->start) \ 1138 */ 1074 _vm_map_clip_start(map, entry, startaddr); \ 1139 static void 1075 } 1140 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) 1076 1141 { 1077 /* 1142 vm_map_entry_t new_entry; 1078 * This routine is called only when it is known that 1143 1079 * the entry must be split. 1144 /* 1080 */ 1145 * If there is no object backing this entry, we might as well create 1081 static void 1146 * one now. If we defer it, an object can get created after the map 1082 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) 1147 * is clipped, and individual objects will be created for the split-up 1083 { 1148 * map. This is a bit of a hack, but is also about the best place to 1084 vm_map_entry_t new_entry; 1149 * put this improvement. 1085 1150 */ 1086 /* 1151 if (entry->object.vm_object == NULL && !map->system_map) { 1087 * Split off the front portion -- note that we must insert the new 1152 vm_object_t object; 1088 * entry BEFORE this one, so that this entry has the specified 1153 object = vm_object_allocate(OBJT_DEFAULT, 1089 * starting address. 1154 atop(entry->end - entry->start)); 1090 */ 1155 entry->object.vm_object = object; 1091 vm_map_simplify_entry(map, entry); 1156 entry->offset = 0; 1092 1157 } 1093 /* 1158 1094 * If there is no object backing this entry, we might as well create 1159 /* 1095 * one now. If we defer it, an object can get created after the map 1160 * Create a new entry and insert it AFTER the specified entry 1096 * is clipped, and individual objects will be created for the split-up 1161 */ 1097 * map. This is a bit of a hack, but is also about the best place to 1162 new_entry = vm_map_entry_create(map); 1098 * put this improvement. 1163 *new_entry = *entry; 1099 */ 1164 1100 if (entry->object.vm_object == NULL && !map->system_map) { 1165 new_entry->start = entry->end = end; 1101 vm_object_t object; 1166 new_entry->offset += (end - entry->start); 11/19/03 10:48:45 sys/vm/vm_map.c 10 1167 1232 (entry->object.vm_object == NULL)) { 1168 vm_map_entry_link(map, entry, new_entry); 1233 entry->object.sub_map = submap; 1169 1234 entry->eflags |= MAP_ENTRY_IS_SUB_MAP; 1170 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { 1235 result = KERN_SUCCESS; 1171 vm_object_reference(new_entry->object.vm_object); 1236 } 1172 } 1237 vm_map_unlock(map); 1173 } 1238 1174 1239 return (result); 1175 /* 1240 } 1176 * VM_MAP_RANGE_CHECK: [ internal use only ] 1241 1177 * 1242 /* 1178 * Asserts that the starting and ending region 1243 * The maximum number of pages to map 1179 * addresses fall within the valid range of the map. 
1244 */ 1180 */ 1245 #define MAX_INIT_PT 96 1181 #define VM_MAP_RANGE_CHECK(map, start, end) \ 1246 1182 { \ 1247 /* 1183 if (start < vm_map_min(map)) \ 1248 * vm_map_pmap_enter: 1184 start = vm_map_min(map); \ 1249 * 1185 if (end > vm_map_max(map)) \ 1250 * Preload the mappings for the given object into the specified 1186 end = vm_map_max(map); \ 1251 * map. This eliminates the soft faults on process startup and 1187 if (start > end) \ 1252 * immediately after an mmap(2). 1188 start = end; \ 1253 */ 1189 } 1254 void 1190 1255 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, 1191 /* 1256 vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) 1192 * vm_map_submap: [ kernel use only ] 1257 { 1193 * 1258 vm_offset_t tmpidx; 1194 * Mark the given range as handled by a subordinate map. 1259 int psize; 1195 * 1260 vm_page_t p, mpte; 1196 * This range must have been created with vm_map_find, 1261 1197 * and no other operations may have been performed on this 1262 if (object == NULL) 1198 * range prior to calling vm_map_submap. 1263 return; 1199 * 1264 mtx_lock(&Giant); 1200 * Only a limited number of operations can be performed 1265 VM_OBJECT_LOCK(object); 1201 * within this rage after calling vm_map_submap: 1266 if (object->type == OBJT_DEVICE) { 1202 * vm_fault 1267 pmap_object_init_pt(map->pmap, addr, object, pindex, size); 1203 * [Don’t try vm_map_copy!] 1268 goto unlock_return; 1204 * 1269 } 1205 * To remove a submapping, one must first remove the 1270 1206 * range from the superior map, and then destroy the 1271 psize = atop(size); 1207 * submap (if desired). [Better yet, don’t try it.] 1272 1208 */ 1273 if (object->type != OBJT_VNODE || 1209 int 1274 ((flags & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 1210 vm_map_submap( 1275 (object->resident_page_count > MAX_INIT_PT))) { 1211 vm_map_t map, 1276 goto unlock_return; 1212 vm_offset_t start, 1277 } 1213 vm_offset_t end, 1278 1214 vm_map_t submap) 1279 if (psize + pindex > object->size) { 1215 { 1280 if (object->size < pindex) 1216 vm_map_entry_t entry; 1281 goto unlock_return; 1217 int result = KERN_INVALID_ARGUMENT; 1282 psize = object->size - pindex; 1218 1283 } 1219 vm_map_lock(map); 1284 1220 1285 mpte = NULL; 1221 VM_MAP_RANGE_CHECK(map, start, end); 1286 1222 1287 if ((p = TAILQ_FIRST(&object->memq)) != NULL) { 1223 if (vm_map_lookup_entry(map, start, &entry)) { 1288 if (p->pindex < pindex) { 1224 vm_map_clip_start(map, entry, start); 1289 p = vm_page_splay(pindex, object->root); 1225 } else 1290 if ((object->root = p)->pindex < pindex) 1226 entry = entry->next; 1291 p = TAILQ_NEXT(p, listq); 1227 1292 } 1228 vm_map_clip_end(map, entry, end); 1293 } 1229 1294 /* 1230 if ((entry->start == start) && (entry->end == end) && 1295 * Assert: the variable p is either (1) the page with the 1231 ((entry->eflags & MAP_ENTRY_COW) == 0) && 1296 * least pindex greater than or equal to the parameter pindex 11/19/03 10:48:45 sys/vm/vm_map.c 11 1297 * or (2) NULL. 1362 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { 1298 */ 1363 vm_map_unlock(map); 1299 for (; 1364 return (KERN_INVALID_ARGUMENT); 1300 p != NULL && (tmpidx = p->pindex - pindex) < psize; 1365 } 1301 p = TAILQ_NEXT(p, listq)) { 1366 if ((new_prot & current->max_protection) != new_prot) { 1302 /* 1367 vm_map_unlock(map); 1303 * don’t allow an madvise to blow away our really 1368 return (KERN_PROTECTION_FAILURE); 1304 * free pages allocating pv entries. 
1369 } 1305 */ 1370 current = current->next; 1306 if ((flags & MAP_PREFAULT_MADVISE) && 1371 } 1307 cnt.v_free_count < cnt.v_free_reserved) { 1372 1308 break; 1373 /* 1309 } 1374 * Go back and fix up protections. [Note that clipping is not 1310 vm_page_lock_queues(); 1375 * necessary the second time.] 1311 if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL && 1376 */ 1312 (p->busy == 0) && 1377 current = entry; 1313 (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { 1378 while ((current != &map->header) && (current->start < end)) { 1314 if ((p->queue - p->pc) == PQ_CACHE) 1379 vm_prot_t old_prot; 1315 vm_page_deactivate(p); 1380 1316 vm_page_busy(p); 1381 vm_map_clip_end(map, current, end); 1317 vm_page_unlock_queues(); 1382 1318 VM_OBJECT_UNLOCK(object); 1383 old_prot = current->protection; 1319 mpte = pmap_enter_quick(map->pmap, 1384 if (set_max) 1320 addr + ptoa(tmpidx), p, mpte); 1385 current->protection = 1321 VM_OBJECT_LOCK(object); 1386 (current->max_protection = new_prot) & 1322 vm_page_lock_queues(); 1387 old_prot; 1323 vm_page_wakeup(p); 1388 else 1324 } 1389 current->protection = new_prot; 1325 vm_page_unlock_queues(); 1390 1326 } 1391 /* 1327 unlock_return: 1392 * Update physical map if necessary. Worry about copy-on-write 1328 VM_OBJECT_UNLOCK(object); 1393 * here -- CHECK THIS XXX 1329 mtx_unlock(&Giant); 1394 */ 1330 } 1395 if (current->protection != old_prot) { 1331 1396 mtx_lock(&Giant); 1332 /* 1397 vm_page_lock_queues(); 1333 * vm_map_protect: 1398 #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ˜VM_PROT_WRITE : 1334 * \ 1335 * Sets the protection of the specified address 1399 VM_PROT_ALL) 1336 * region in the target map. If "set_max" is 1400 pmap_protect(map->pmap, current->start, 1337 * specified, the maximum protection is to be set; 1401 current->end, 1338 * otherwise, only the current protection is affected. 1402 current->protection & MASK(current)); 1339 */ 1403 #undef MASK 1340 int 1404 vm_page_unlock_queues(); 1341 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, 1405 mtx_unlock(&Giant); 1342 vm_prot_t new_prot, boolean_t set_max) 1406 } 1343 { 1407 vm_map_simplify_entry(map, current); 1344 vm_map_entry_t current; 1408 current = current->next; 1345 vm_map_entry_t entry; 1409 } 1346 1410 vm_map_unlock(map); 1347 vm_map_lock(map); 1411 return (KERN_SUCCESS); 1348 1412 } 1349 VM_MAP_RANGE_CHECK(map, start, end); 1413 1350 1414 /* 1351 if (vm_map_lookup_entry(map, start, &entry)) { 1415 * vm_map_madvise: 1352 vm_map_clip_start(map, entry, start); 1416 * 1353 } else { 1417 * This routine traverses a processes map handling the madvise 1354 entry = entry->next; 1418 * system call. Advisories are classified as either those effecting 1355 } 1419 * the vm_map_entry structure, or those effecting the underlying 1356 1420 * objects. 1357 /* 1421 */ 1358 * Make a first pass to check for protection violations. 
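/*
 * Illustrative sketch, not part of vm_map.c: the two-pass shape of
 * vm_map_protect() -- refuse the whole request unless every entry permits
 * it, then apply, and when pushing to the physical map strip write access
 * from copy-on-write entries (the MASK(entry) idea above) so they keep
 * taking write faults.  The TP_* bits, struct toy_entry and toy_protect()
 * are invented.
 */
#include <stdio.h>

#define TP_READ   0x1
#define TP_WRITE  0x2
#define TP_EXEC   0x4

struct toy_entry {
	unsigned prot;		/* current protection  */
	unsigned max_prot;	/* ceiling             */
	int	 is_cow;	/* copy-on-write entry */
};

static int
toy_protect(struct toy_entry *e, int n, unsigned new_prot)
{
	/* pass 1: check for protection violations */
	for (int i = 0; i < n; i++)
		if ((new_prot & e[i].max_prot) != new_prot)
			return (-1);		/* like KERN_PROTECTION_FAILURE */
	/* pass 2: go back and fix up protections */
	for (int i = 0; i < n; i++) {
		unsigned hw_prot = new_prot;

		e[i].prot = new_prot;
		if (e[i].is_cow)
			hw_prot &= ~TP_WRITE;	/* keep taking write faults */
		printf("entry %d: logical %#x, hardware %#x\n",
		    i, e[i].prot, hw_prot);
	}
	return (0);
}

int
main(void)
{
	struct toy_entry e[2] = {
		{ TP_READ, TP_READ | TP_WRITE, 1 },	/* COW entry        */
		{ TP_READ, TP_READ | TP_WRITE, 0 },	/* ordinary entry   */
	};

	if (toy_protect(e, 2, TP_READ | TP_WRITE) != 0)
		printf("protection change refused\n");
	return (0);
}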
1422 int 1359 */ 1423 vm_map_madvise( 1360 current = entry; 1424 vm_map_t map, 1361 while ((current != &map->header) && (current->start < end)) { 1425 vm_offset_t start, 11/19/03 10:48:45 sys/vm/vm_map.c 12 1426 vm_offset_t end, 1490 case MADV_SEQUENTIAL: 1427 int behav) 1491 vm_map_entry_set_behavior(current, MAP_ENTRY_B 1428 { EHAV_SEQUENTIAL); 1429 vm_map_entry_t current, entry; 1492 break; 1430 int modify_map = 0; 1493 case MADV_RANDOM: 1431 1494 vm_map_entry_set_behavior(current, MAP_ENTRY_B 1432 /* EHAV_RANDOM); 1433 * Some madvise calls directly modify the vm_map_entry, in which case 1495 break; 1434 * we need to use an exclusive lock on the map and we need to perform 1496 case MADV_NOSYNC: 1435 * various clipping operations. Otherwise we only need a read-lock 1497 current->eflags |= MAP_ENTRY_NOSYNC; 1436 * on the map. 1498 break; 1437 */ 1499 case MADV_AUTOSYNC: 1438 switch(behav) { 1500 current->eflags &= ˜MAP_ENTRY_NOSYNC; 1439 case MADV_NORMAL: 1501 break; 1440 case MADV_SEQUENTIAL: 1502 case MADV_NOCORE: 1441 case MADV_RANDOM: 1503 current->eflags |= MAP_ENTRY_NOCOREDUMP; 1442 case MADV_NOSYNC: 1504 break; 1443 case MADV_AUTOSYNC: 1505 case MADV_CORE: 1444 case MADV_NOCORE: 1506 current->eflags &= ˜MAP_ENTRY_NOCOREDUMP; 1445 case MADV_CORE: 1507 break; 1446 modify_map = 1; 1508 default: 1447 vm_map_lock(map); 1509 break; 1448 break; 1510 } 1449 case MADV_WILLNEED: 1511 vm_map_simplify_entry(map, current); 1450 case MADV_DONTNEED: 1512 } 1451 case MADV_FREE: 1513 vm_map_unlock(map); 1452 vm_map_lock_read(map); 1514 } else { 1453 break; 1515 vm_pindex_t pindex; 1454 default: 1516 int count; 1455 return (KERN_INVALID_ARGUMENT); 1517 1456 } 1518 /* 1457 1519 * madvise behaviors that are implemented in the underlying 1458 /* 1520 * vm_object. 1459 * Locate starting entry and clip if necessary. 1521 * 1460 */ 1522 * Since we don’t clip the vm_map_entry, we have to clip 1461 VM_MAP_RANGE_CHECK(map, start, end); 1523 * the vm_object pindex and count. 1462 1524 */ 1463 if (vm_map_lookup_entry(map, start, &entry)) { 1525 for (current = entry; 1464 if (modify_map) 1526 (current != &map->header) && (current->start < end); 1465 vm_map_clip_start(map, entry, start); 1527 current = current->next 1466 } else { 1528 ) { 1467 entry = entry->next; 1529 vm_offset_t useStart; 1468 } 1530 1469 1531 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) 1470 if (modify_map) { 1532 continue; 1471 /* 1533 1472 * madvise behaviors that are implemented in the vm_map_entry. 1534 pindex = OFF_TO_IDX(current->offset); 1473 * 1535 count = atop(current->end - current->start); 1474 * We clip the vm_map_entry so that behavioral changes are 1536 useStart = current->start; 1475 * limited to the specified address range. 
1537 1476 */ 1538 if (current->start < start) { 1477 for (current = entry; 1539 pindex += atop(start - current->start); 1478 (current != &map->header) && (current->start < end); 1540 count -= atop(start - current->start); 1479 current = current->next 1541 useStart = start; 1480 ) { 1542 } 1481 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) 1543 if (current->end > end) 1482 continue; 1544 count -= atop(current->end - end); 1483 1545 1484 vm_map_clip_end(map, current, end); 1546 if (count <= 0) 1485 1547 continue; 1486 switch (behav) { 1548 1487 case MADV_NORMAL: 1549 vm_object_madvise(current->object.vm_object, 1488 vm_map_entry_set_behavior(current, MAP_ENTRY_B 1550 pindex, count, behav); EHAV_NORMAL); 1551 if (behav == MADV_WILLNEED) { 1489 break; 1552 vm_map_pmap_enter(map, 11/19/03 10:48:45 sys/vm/vm_map.c 13 1553 useStart, 1618 unsigned int last_timestamp; 1554 current->object.vm_object, 1619 int rv; 1555 pindex, 1620 boolean_t need_wakeup, result, user_unwire; 1556 (count << PAGE_SHIFT), 1621 1557 MAP_PREFAULT_MADVISE 1622 user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; 1558 ); 1623 vm_map_lock(map); 1559 } 1624 VM_MAP_RANGE_CHECK(map, start, end); 1560 } 1625 if (!vm_map_lookup_entry(map, start, &first_entry)) { 1561 vm_map_unlock_read(map); 1626 if (flags & VM_MAP_WIRE_HOLESOK) 1562 } 1627 first_entry = first_entry->next; 1563 return (0); 1628 else { 1564 } 1629 vm_map_unlock(map); 1565 1630 return (KERN_INVALID_ADDRESS); 1566 1631 } 1567 /* 1632 } 1568 * vm_map_inherit: 1633 last_timestamp = map->timestamp; 1569 * 1634 entry = first_entry; 1570 * Sets the inheritance of the specified address 1635 while (entry != &map->header && entry->start < end) { 1571 * range in the target map. Inheritance 1636 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 1572 * affects how the map will be shared with 1637 /* 1573 * child maps at the time of vm_map_fork. 1638 * We have not yet clipped the entry. 1574 */ 1639 */ 1575 int 1640 saved_start = (start >= entry->start) ? start : 1576 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, 1641 entry->start; 1577 vm_inherit_t new_inheritance) 1642 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 1578 { 1643 if (vm_map_unlock_and_wait(map, user_unwire)) { 1579 vm_map_entry_t entry; 1644 /* 1580 vm_map_entry_t temp_entry; 1645 * Allow interruption of user unwiring? 1581 1646 */ 1582 switch (new_inheritance) { 1647 } 1583 case VM_INHERIT_NONE: 1648 vm_map_lock(map); 1584 case VM_INHERIT_COPY: 1649 if (last_timestamp+1 != map->timestamp) { 1585 case VM_INHERIT_SHARE: 1650 /* 1586 break; 1651 * Look again for the entry because the map wa 1587 default: s 1588 return (KERN_INVALID_ARGUMENT); 1652 * modified while it was unlocked. 1589 } 1653 * Specifically, the entry may have been 1590 vm_map_lock(map); 1654 * clipped, merged, or deleted. 1591 VM_MAP_RANGE_CHECK(map, start, end); 1655 */ 1592 if (vm_map_lookup_entry(map, start, &temp_entry)) { 1656 if (!vm_map_lookup_entry(map, saved_start, 1593 entry = temp_entry; 1657 &tmp_entry)) { 1594 vm_map_clip_start(map, entry, start); 1658 if (flags & VM_MAP_WIRE_HOLESOK) 1595 } else 1659 tmp_entry = tmp_entry->next; 1596 entry = temp_entry->next; 1660 else { 1597 while ((entry != &map->header) && (entry->start < end)) { 1661 if (saved_start == start) { 1598 vm_map_clip_end(map, entry, end); 1662 /* 1599 entry->inheritance = new_inheritance; 1663 * First_entry has bee 1600 vm_map_simplify_entry(map, entry); n deleted. 
1601 entry = entry->next; 1664 */ 1602 } 1665 vm_map_unlock(map); 1603 vm_map_unlock(map); 1666 return (KERN_INVALID_A 1604 return (KERN_SUCCESS); DDRESS); 1605 } 1667 } 1606 1668 end = saved_start; 1607 /* 1669 rv = KERN_INVALID_ADDRESS; 1608 * vm_map_unwire: 1670 goto done; 1609 * 1671 } 1610 * Implements both kernel and user unwiring. 1672 } 1611 */ 1673 if (entry == first_entry) 1612 int 1674 first_entry = tmp_entry; 1613 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, 1675 else 1614 int flags) 1676 first_entry = NULL; 1615 { 1677 entry = tmp_entry; 1616 vm_map_entry_t entry, first_entry, tmp_entry; 1678 } 1617 vm_offset_t saved_start; 1679 last_timestamp = map->timestamp; 11/19/03 10:48:45 sys/vm/vm_map.c 14 1680 continue; 1744 vm_map_unlock(map); 1681 } 1745 if (need_wakeup) 1682 vm_map_clip_start(map, entry, start); 1746 vm_map_wakeup(map); 1683 vm_map_clip_end(map, entry, end); 1747 return (rv); 1684 /* 1748 } 1685 * Mark the entry in case the map lock is released. (See 1749 1686 * above.) 1750 /* 1687 */ 1751 * vm_map_wire: 1688 entry->eflags |= MAP_ENTRY_IN_TRANSITION; 1752 * 1689 /* 1753 * Implements both kernel and user wiring. 1690 * Check the map for holes in the specified region. 1754 */ 1691 * If VM_MAP_WIRE_HOLESOK was specified, skip this check. 1755 int 1692 */ 1756 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, 1693 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && 1757 int flags) 1694 (entry->end < end && (entry->next == &map->header || 1758 { 1695 entry->next->start > entry->end))) { 1759 vm_map_entry_t entry, first_entry, tmp_entry; 1696 end = entry->end; 1760 vm_offset_t saved_end, saved_start; 1697 rv = KERN_INVALID_ADDRESS; 1761 unsigned int last_timestamp; 1698 goto done; 1762 int rv; 1699 } 1763 boolean_t need_wakeup, result, user_wire; 1700 /* 1764 1701 * Require that the entry is wired. 1765 user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; 1702 */ 1766 vm_map_lock(map); 1703 if (entry->wired_count == 0 || (user_unwire && 1767 VM_MAP_RANGE_CHECK(map, start, end); 1704 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)) { 1768 if (!vm_map_lookup_entry(map, start, &first_entry)) { 1705 end = entry->end; 1769 if (flags & VM_MAP_WIRE_HOLESOK) 1706 rv = KERN_INVALID_ARGUMENT; 1770 first_entry = first_entry->next; 1707 goto done; 1771 else { 1708 } 1772 vm_map_unlock(map); 1709 entry = entry->next; 1773 return (KERN_INVALID_ADDRESS); 1710 } 1774 } 1711 rv = KERN_SUCCESS; 1775 } 1712 done: 1776 last_timestamp = map->timestamp; 1713 need_wakeup = FALSE; 1777 entry = first_entry; 1714 if (first_entry == NULL) { 1778 while (entry != &map->header && entry->start < end) { 1715 result = vm_map_lookup_entry(map, start, &first_entry); 1779 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 1716 if (!result && (flags & VM_MAP_WIRE_HOLESOK)) 1780 /* 1717 first_entry = first_entry->next; 1781 * We have not yet clipped the entry. 1718 else 1782 */ 1719 KASSERT(result, ("vm_map_unwire: lookup failed")); 1783 saved_start = (start >= entry->start) ? start : 1720 } 1784 entry->start; 1721 entry = first_entry; 1785 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 1722 while (entry != &map->header && entry->start < end) { 1786 if (vm_map_unlock_and_wait(map, user_wire)) { 1723 if (rv == KERN_SUCCESS) { 1787 /* 1724 if (user_unwire) 1788 * Allow interruption of user wiring? 
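/*
 * Illustrative sketch, not part of vm_map.c: the wire/unwire code above
 * records map->timestamp, drops the map lock to sleep or fault, and after
 * re-locking re-looks-up its entry only when the timestamp advanced by more
 * than one (every exclusive acquisition bumps it, so "+1" accounts for the
 * walker's own re-lock).  struct toy_map and the helpers are invented;
 * pthread mutexes stand in for the map lock.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_map {
	pthread_mutex_t lock;
	unsigned int timestamp;
};

/* Mirrors _vm_map_lock(): every exclusive acquisition bumps the timestamp. */
static void
toy_map_lock(struct toy_map *m)
{
	pthread_mutex_lock(&m->lock);
	m->timestamp++;
}

static void
toy_map_unlock(struct toy_map *m)
{
	pthread_mutex_unlock(&m->lock);
}

static void
toy_walk_step(struct toy_map *m, int simulate_intruder)
{
	unsigned int last_timestamp;

	toy_map_lock(m);
	last_timestamp = m->timestamp;
	toy_map_unlock(m);		/* e.g. vm_map_unlock_and_wait() */

	if (simulate_intruder) {	/* someone else modifies the map */
		toy_map_lock(m);
		toy_map_unlock(m);
	}

	toy_map_lock(m);
	if (last_timestamp + 1 != m->timestamp)
		printf("map changed while unlocked: re-lookup the entry\n");
	else
		printf("map unchanged: cached entry pointers still valid\n");
	toy_map_unlock(m);
}

int
main(void)
{
	struct toy_map m = { PTHREAD_MUTEX_INITIALIZER, 0 };

	toy_walk_step(&m, 0);
	toy_walk_step(&m, 1);
	return (0);
}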
1725 entry->eflags &= ˜MAP_ENTRY_USER_WIRED; 1789 */ 1726 entry->wired_count--; 1790 } 1727 if (entry->wired_count == 0) { 1791 vm_map_lock(map); 1728 /* 1792 if (last_timestamp + 1 != map->timestamp) { 1729 * Retain the map lock. 1793 /* 1730 */ 1794 * Look again for the entry because the map wa 1731 vm_fault_unwire(map, entry->start, entry->end) s ; 1795 * modified while it was unlocked. 1732 } 1796 * Specifically, the entry may have been 1733 } 1797 * clipped, merged, or deleted. 1734 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, 1798 */ 1735 ("vm_map_unwire: in-transition flag missing")); 1799 if (!vm_map_lookup_entry(map, saved_start, 1736 entry->eflags &= ˜MAP_ENTRY_IN_TRANSITION; 1800 &tmp_entry)) { 1737 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 1801 if (flags & VM_MAP_WIRE_HOLESOK) 1738 entry->eflags &= ˜MAP_ENTRY_NEEDS_WAKEUP; 1802 tmp_entry = tmp_entry->next; 1739 need_wakeup = TRUE; 1803 else { 1740 } 1804 if (saved_start == start) { 1741 vm_map_simplify_entry(map, entry); 1805 /* 1742 entry = entry->next; 1806 * first_entry has bee 1743 } n deleted. 11/19/03 10:48:45 sys/vm/vm_map.c 15 1807 */ 1867 } 1808 vm_map_unlock(map); 1868 entry = entry->next; 1809 return (KERN_INVALID_A 1869 } DDRESS); 1870 } 1810 } 1871 last_timestamp = map->timestamp; 1811 end = saved_start; 1872 if (rv != KERN_SUCCESS) { 1812 rv = KERN_INVALID_ADDRESS; 1873 KASSERT(entry->wired_count == 1, 1813 goto done; 1874 ("vm_map_wire: bad count")); 1814 } 1875 /* 1815 } 1876 * Assign an out-of-range value to represent 1816 if (entry == first_entry) 1877 * the failure to wire this entry. 1817 first_entry = tmp_entry; 1878 */ 1818 else 1879 entry->wired_count = -1; 1819 first_entry = NULL; 1880 end = entry->end; 1820 entry = tmp_entry; 1881 goto done; 1821 } 1882 } 1822 last_timestamp = map->timestamp; 1883 } else if (!user_wire || 1823 continue; 1884 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { 1824 } 1885 entry->wired_count++; 1825 vm_map_clip_start(map, entry, start); 1886 } 1826 vm_map_clip_end(map, entry, end); 1887 /* 1827 /* 1888 * Check the map for holes in the specified region. 1828 * Mark the entry in case the map lock is released. (See 1889 * If VM_MAP_WIRE_HOLESOK was specified, skip this check. 1829 * above.) 1890 */ 1830 */ 1891 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && 1831 entry->eflags |= MAP_ENTRY_IN_TRANSITION; 1892 (entry->end < end && (entry->next == &map->header || 1832 /* 1893 entry->next->start > entry->end))) { 1833 * 1894 end = entry->end; 1834 */ 1895 rv = KERN_INVALID_ADDRESS; 1835 if (entry->wired_count == 0) { 1896 goto done; 1836 entry->wired_count++; 1897 } 1837 saved_start = entry->start; 1898 entry = entry->next; 1838 saved_end = entry->end; 1899 } 1839 /* 1900 rv = KERN_SUCCESS; 1840 * Release the map lock, relying on the in-transition 1901 done: 1841 * mark. 1902 need_wakeup = FALSE; 1842 */ 1903 if (first_entry == NULL) { 1843 vm_map_unlock(map); 1904 result = vm_map_lookup_entry(map, start, &first_entry); 1844 rv = vm_fault_wire(map, saved_start, saved_end, 1905 if (!result && (flags & VM_MAP_WIRE_HOLESOK)) 1845 user_wire); 1906 first_entry = first_entry->next; 1846 vm_map_lock(map); 1907 else 1847 if (last_timestamp + 1 != map->timestamp) { 1908 KASSERT(result, ("vm_map_wire: lookup failed")); 1848 /* 1909 } 1849 * Look again for the entry because the map wa 1910 entry = first_entry; s 1911 while (entry != &map->header && entry->start < end) { 1850 * modified while it was unlocked. 
The entry 1912 if (rv == KERN_SUCCESS) { 1851 * may have been clipped, but NOT merged or 1913 if (user_wire) 1852 * deleted. 1914 entry->eflags |= MAP_ENTRY_USER_WIRED; 1853 */ 1915 } else if (entry->wired_count == -1) { 1854 result = vm_map_lookup_entry(map, saved_start, 1916 /* 1855 &tmp_entry); 1917 * Wiring failed on this entry. Thus, unwiring is 1856 KASSERT(result, ("vm_map_wire: lookup failed") 1918 * unnecessary. ); 1919 */ 1857 if (entry == first_entry) 1920 entry->wired_count = 0; 1858 first_entry = tmp_entry; 1921 } else { 1859 else 1922 if (!user_wire || 1860 first_entry = NULL; 1923 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) 1861 entry = tmp_entry; 1924 entry->wired_count--; 1862 while (entry->end < saved_end) { 1925 if (entry->wired_count == 0) { 1863 if (rv != KERN_SUCCESS) { 1926 /* 1864 KASSERT(entry->wired_count == 1927 * Retain the map lock. 1, 1928 */ 1865 ("vm_map_wire: bad count") 1929 vm_fault_unwire(map, entry->start, entry->end) ); ; 1866 entry->wired_count = -1; 1930 } 11/19/03 10:48:45 sys/vm/vm_map.c 16 1931 } 1996 (current->next == &map->header || 1932 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, 1997 current->end != current->next->start)) { 1933 ("vm_map_wire: in-transition flag missing")); 1998 vm_map_unlock_read(map); 1934 entry->eflags &= ˜MAP_ENTRY_IN_TRANSITION; 1999 return (KERN_INVALID_ADDRESS); 1935 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 2000 } 1936 entry->eflags &= ˜MAP_ENTRY_NEEDS_WAKEUP; 2001 } 1937 need_wakeup = TRUE; 2002 1938 } 2003 if (invalidate) { 1939 vm_map_simplify_entry(map, entry); 2004 mtx_lock(&Giant); 1940 entry = entry->next; 2005 vm_page_lock_queues(); 1941 } 2006 pmap_remove(map->pmap, start, end); 1942 vm_map_unlock(map); 2007 vm_page_unlock_queues(); 1943 if (need_wakeup) 2008 mtx_unlock(&Giant); 1944 vm_map_wakeup(map); 2009 } 1945 return (rv); 2010 /* 1946 } 2011 * Make a second pass, cleaning/uncaching pages from the indicated 1947 2012 * objects as we go. 1948 /* 2013 */ 1949 * vm_map_sync 2014 for (current = entry; current->start < end; current = current->next) { 1950 * 2015 offset = current->offset + (start - current->start); 1951 * Push any dirty cached pages in the address range to their pager. 2016 size = (end <= current->end ? end : current->end) - start; 1952 * If syncio is TRUE, dirty pages are written synchronously. 2017 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { 1953 * If invalidate is TRUE, any cached pages are freed as well. 2018 vm_map_t smap; 1954 * 2019 vm_map_entry_t tentry; 1955 * If the size of the region from start to end is zero, we are 2020 vm_size_t tsize; 1956 * supposed to flush all modified pages within the region containing 2021 1957 * start. Unfortunately, a region can be split or coalesced with 2022 smap = current->object.sub_map; 1958 * neighboring regions, making it difficult to determine what the 2023 vm_map_lock_read(smap); 1959 * original region was. Therefore, we approximate this requirement by 2024 (void) vm_map_lookup_entry(smap, offset, &tentry); 1960 * flushing the current region containing start. 2025 tsize = tentry->end - offset; 1961 * 2026 if (tsize < size) 1962 * Returns an error if any part of the specified range is not mapped. 
2027 size = tsize; 1963 */ 2028 object = tentry->object.vm_object; 1964 int 2029 offset = tentry->offset + (offset - tentry->start); 1965 vm_map_sync( 2030 vm_map_unlock_read(smap); 1966 vm_map_t map, 2031 } else { 1967 vm_offset_t start, 2032 object = current->object.vm_object; 1968 vm_offset_t end, 2033 } 1969 boolean_t syncio, 2034 vm_object_sync(object, offset, size, syncio, invalidate); 1970 boolean_t invalidate) 2035 start += size; 1971 { 2036 } 1972 vm_map_entry_t current; 2037 1973 vm_map_entry_t entry; 2038 vm_map_unlock_read(map); 1974 vm_size_t size; 2039 return (KERN_SUCCESS); 1975 vm_object_t object; 2040 } 1976 vm_ooffset_t offset; 2041 1977 2042 /* 1978 vm_map_lock_read(map); 2043 * vm_map_entry_unwire: [ internal use only ] 1979 VM_MAP_RANGE_CHECK(map, start, end); 2044 * 1980 if (!vm_map_lookup_entry(map, start, &entry)) { 2045 * Make the region specified by this entry pageable. 1981 vm_map_unlock_read(map); 2046 * 1982 return (KERN_INVALID_ADDRESS); 2047 * The map in question should be locked. 1983 } else if (start == end) { 2048 * [This is the reason for this routine’s existence.] 1984 start = entry->start; 2049 */ 1985 end = entry->end; 2050 static void 1986 } 2051 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) 1987 /* 2052 { 1988 * Make a first pass to check for user-wired memory and holes. 2053 vm_fault_unwire(map, entry->start, entry->end); 1989 */ 2054 entry->wired_count = 0; 1990 for (current = entry; current->start < end; current = current->next) { 2055 } 1991 if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { 2056 1992 vm_map_unlock_read(map); 2057 /* 1993 return (KERN_INVALID_ARGUMENT); 2058 * vm_map_entry_delete: [ internal use only ] 1994 } 2059 * 1995 if (end > current->end && 2060 * Deallocate the given entry from the target map. 11/19/03 10:48:45 sys/vm/vm_map.c 17 2061 */ 2122 map->first_free = &map->header; 2062 static void 2123 } else if (map->first_free->start >= start) { 2063 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) 2124 map->first_free = entry->prev; 2064 { 2125 } 2065 vm_object_t object; 2126 2066 vm_pindex_t offidxstart, offidxend, count; 2127 /* 2067 2128 * Step through all entries in this region 2068 vm_map_entry_unlink(map, entry); 2129 */ 2069 map->size -= entry->end - entry->start; 2130 while ((entry != &map->header) && (entry->start < end)) { 2070 2131 vm_map_entry_t next; 2071 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && 2132 2072 (object = entry->object.vm_object) != NULL) { 2133 /* 2073 count = OFF_TO_IDX(entry->end - entry->start); 2134 * Wait for wiring or unwiring of an entry to complete. 
2074 offidxstart = OFF_TO_IDX(entry->offset); 2135 */ 2075 offidxend = offidxstart + count; 2136 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) { 2076 VM_OBJECT_LOCK(object); 2137 unsigned int last_timestamp; 2077 if (object->ref_count != 1 && 2138 vm_offset_t saved_start; 2078 ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONE 2139 vm_map_entry_t tmp_entry; MAPPING || 2140 2079 object == kernel_object || object == kmem_object) && 2141 saved_start = entry->start; 2080 (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP 2142 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; )) { 2143 last_timestamp = map->timestamp; 2081 vm_object_collapse(object); 2144 (void) vm_map_unlock_and_wait(map, FALSE); 2082 vm_object_page_remove(object, offidxstart, offidxend, 2145 vm_map_lock(map); FALSE); 2146 if (last_timestamp + 1 != map->timestamp) { 2083 if (object->type == OBJT_SWAP) 2147 /* 2084 swap_pager_freespace(object, offidxstart, coun 2148 * Look again for the entry because the map wa t); s 2085 if (offidxend >= object->size && 2149 * modified while it was unlocked. 2086 offidxstart < object->size) 2150 * Specifically, the entry may have been 2087 object->size = offidxstart; 2151 * clipped, merged, or deleted. 2088 } 2152 */ 2089 VM_OBJECT_UNLOCK(object); 2153 if (!vm_map_lookup_entry(map, saved_start, 2090 vm_object_deallocate(object); 2154 &tmp_entry)) 2091 } 2155 entry = tmp_entry->next; 2092 2156 else { 2093 vm_map_entry_dispose(map, entry); 2157 entry = tmp_entry; 2094 } 2158 vm_map_clip_start(map, entry, 2095 2159 saved_start); 2096 /* 2160 } 2097 * vm_map_delete: [ internal use only ] 2161 } 2098 * 2162 continue; 2099 * Deallocates the given address range from the target 2163 } 2100 * map. 2164 vm_map_clip_end(map, entry, end); 2101 */ 2165 2102 int 2166 next = entry->next; 2103 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) 2167 2104 { 2168 /* 2105 vm_map_entry_t entry; 2169 * Unwire before removing addresses from the pmap; otherwise, 2106 vm_map_entry_t first_entry; 2170 * unwiring will put the entries back in the pmap. 2107 2171 */ 2108 /* 2172 if (entry->wired_count != 0) { 2109 * Find the start of the region, and clip it 2173 vm_map_entry_unwire(map, entry); 2110 */ 2174 } 2111 if (!vm_map_lookup_entry(map, start, &first_entry)) 2175 2112 entry = first_entry->next; 2176 if (map != kmem_map) 2113 else { 2177 mtx_lock(&Giant); 2114 entry = first_entry; 2178 vm_page_lock_queues(); 2115 vm_map_clip_start(map, entry, start); 2179 pmap_remove(map->pmap, entry->start, entry->end); 2116 } 2180 vm_page_unlock_queues(); 2117 2181 if (map != kmem_map) 2118 /* 2182 mtx_unlock(&Giant); 2119 * Save the free space hint 2183 2120 */ 2184 /* 2121 if (entry == &map->header) { 2185 * Delete the entry (which may delete the object) only after 11/19/03 10:48:45 sys/vm/vm_map.c 18 2186 * removing all pmap entries pointing to its pages. 2251 */ 2187 * (Otherwise, its page frames may be reallocated, and any 2252 if (start < entry->start) 2188 * modify bits will be set in the wrong object!) 2253 return (FALSE); 2189 */ 2254 /* 2190 vm_map_entry_delete(map, entry); 2255 * Check protection associated with entry. 2191 entry = next; 2256 */ 2192 } 2257 if ((entry->protection & protection) != protection) 2193 return (KERN_SUCCESS); 2258 return (FALSE); 2194 } 2259 /* go to next entry */ 2195 2260 start = entry->end; 2196 /* 2261 entry = entry->next; 2197 * vm_map_remove: 2262 } 2198 * 2263 return (TRUE); 2199 * Remove the given address range from the target map. 
2264 } 2200 * This is the exported form of vm_map_delete. 2265 2201 */ 2266 /* 2202 int 2267 * vm_map_copy_entry: 2203 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) 2268 * 2204 { 2269 * Copies the contents of the source entry to the destination 2205 int result, s = 0; 2270 * entry. The entries *must* be aligned properly. 2206 2271 */ 2207 if (map == kmem_map) 2272 static void 2208 s = splvm(); 2273 vm_map_copy_entry( 2209 2274 vm_map_t src_map, 2210 vm_map_lock(map); 2275 vm_map_t dst_map, 2211 VM_MAP_RANGE_CHECK(map, start, end); 2276 vm_map_entry_t src_entry, 2212 result = vm_map_delete(map, start, end); 2277 vm_map_entry_t dst_entry) 2213 vm_map_unlock(map); 2278 { 2214 2279 vm_object_t src_object; 2215 if (map == kmem_map) 2280 2216 splx(s); 2281 if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) 2217 2282 return; 2218 return (result); 2283 2219 } 2284 if (src_entry->wired_count == 0) { 2220 2285 2221 /* 2286 /* 2222 * vm_map_check_protection: 2287 * If the source entry is marked needs_copy, it is already 2223 * 2288 * write-protected. 2224 * Assert that the target map allows the specified privilege on the 2289 */ 2225 * entire address region given. The entire region must be allocated. 2290 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { 2226 * 2291 vm_page_lock_queues(); 2227 * WARNING! This code does not and should not check whether the 2292 pmap_protect(src_map->pmap, 2228 * contents of the region is accessible. For example a smaller file 2293 src_entry->start, 2229 * might be mapped into a larger address space. 2294 src_entry->end, 2230 * 2295 src_entry->protection & ˜VM_PROT_WRITE); 2231 * NOTE! This code is also called by munmap(). 2296 vm_page_unlock_queues(); 2232 * 2297 } 2233 * The map must be locked. A read lock is sufficient. 2298 2234 */ 2299 /* 2235 boolean_t 2300 * Make a copy of the object. 2236 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, 2301 */ 2237 vm_prot_t protection) 2302 if ((src_object = src_entry->object.vm_object) != NULL) { 2238 { 2303 VM_OBJECT_LOCK(src_object); 2239 vm_map_entry_t entry; 2304 if ((src_object->handle == NULL) && 2240 vm_map_entry_t tmp_entry; 2305 (src_object->type == OBJT_DEFAULT || 2241 2306 src_object->type == OBJT_SWAP)) { 2242 if (!vm_map_lookup_entry(map, start, &tmp_entry)) 2307 vm_object_collapse(src_object); 2243 return (FALSE); 2308 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEM 2244 entry = tmp_entry; APPING)) == OBJ_ONEMAPPING) { 2245 2309 VM_OBJECT_UNLOCK(src_object); 2246 while (start < end) { 2310 vm_object_split(src_entry); 2247 if (entry == &map->header) 2311 src_object = src_entry->object.vm_obje 2248 return (FALSE); ct; 2249 /* 2312 VM_OBJECT_LOCK(src_object); 2250 * No holes allowed! 2313 } 11/19/03 10:48:45 sys/vm/vm_map.c 19 2314 } 2377 panic("vm_map_fork: encountered a submap"); 2315 vm_object_reference_locked(src_object); 2378 2316 vm_object_clear_flag(src_object, OBJ_ONEMAPPING); 2379 switch (old_entry->inheritance) { 2317 VM_OBJECT_UNLOCK(src_object); 2380 case VM_INHERIT_NONE: 2318 dst_entry->object.vm_object = src_object; 2381 break; 2319 src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_CO 2382 PY); 2383 case VM_INHERIT_SHARE: 2320 dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_CO 2384 /* PY); 2385 * Clone the entry, creating the shared object if nece 2321 dst_entry->offset = src_entry->offset; ssary. 
2322 } else { 2386 */ 2323 dst_entry->object.vm_object = NULL; 2387 object = old_entry->object.vm_object; 2324 dst_entry->offset = 0; 2388 if (object == NULL) { 2325 } 2389 object = vm_object_allocate(OBJT_DEFAULT, 2326 2390 atop(old_entry->end - old_entry->start 2327 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, )); 2328 dst_entry->end - dst_entry->start, src_entry->start); 2391 old_entry->object.vm_object = object; 2329 } else { 2392 old_entry->offset = (vm_offset_t) 0; 2330 /* 2393 } 2331 * Of course, wired down pages can’t be set copy-on-write. 2394 2332 * Cause wired pages to be copied into the new map by 2395 /* 2333 * simulating faults (the new pages are pageable) 2396 * Add the reference before calling vm_object_shadow 2334 */ 2397 * to insure that a shadow object is created. 2335 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); 2398 */ 2336 } 2399 vm_object_reference(object); 2337 } 2400 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { 2338 2401 vm_object_shadow(&old_entry->object.vm_object, 2339 /* 2402 &old_entry->offset, 2340 * vmspace_fork: 2403 atop(old_entry->end - old_entry->start 2341 * Create a new process vmspace structure and vm_map )); 2342 * based on those of an existing process. The new map 2404 old_entry->eflags &= ˜MAP_ENTRY_NEEDS_COPY; 2343 * is based on the old map, according to the inheritance 2405 /* Transfer the second reference too. */ 2344 * values on the regions in that map. 2406 vm_object_reference( 2345 * 2407 old_entry->object.vm_object); 2346 * The source map must not be locked. 2408 vm_object_deallocate(object); 2347 */ 2409 object = old_entry->object.vm_object; 2348 struct vmspace * 2410 } 2349 vmspace_fork(struct vmspace *vm1) 2411 VM_OBJECT_LOCK(object); 2350 { 2412 vm_object_clear_flag(object, OBJ_ONEMAPPING); 2351 struct vmspace *vm2; 2413 VM_OBJECT_UNLOCK(object); 2352 vm_map_t old_map = &vm1->vm_map; 2414 2353 vm_map_t new_map; 2415 /* 2354 vm_map_entry_t old_entry; 2416 * Clone the entry, referencing the shared object. 2355 vm_map_entry_t new_entry; 2417 */ 2356 vm_object_t object; 2418 new_entry = vm_map_entry_create(new_map); 2357 2419 *new_entry = *old_entry; 2358 GIANT_REQUIRED; 2420 new_entry->eflags &= ˜MAP_ENTRY_USER_WIRED; 2359 2421 new_entry->wired_count = 0; 2360 vm_map_lock(old_map); 2422 2361 old_map->infork = 1; 2423 /* 2362 2424 * Insert the entry into the new map -- we know we’re 2363 vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset); 2425 * inserting at the end of the new map. 2364 bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, 2426 */ 2365 (caddr_t) &vm1->vm_endcopy - (caddr_t) &vm1->vm_startcopy); 2427 vm_map_entry_link(new_map, new_map->header.prev, 2366 new_map = &vm2->vm_map; /* XXX */ 2428 new_entry); 2367 new_map->timestamp = 1; 2429 2368 2430 /* 2369 /* Do not inherit the MAP_WIREFUTURE property. */ 2431 * Update the physical map 2370 if ((new_map->flags & MAP_WIREFUTURE) == MAP_WIREFUTURE) 2432 */ 2371 new_map->flags &= ˜MAP_WIREFUTURE; 2433 pmap_copy(new_map->pmap, old_map->pmap, 2372 2434 new_entry->start, 2373 old_entry = old_map->header.next; 2435 (old_entry->end - old_entry->start), 2374 2436 old_entry->start); 2375 while (old_entry != &old_map->header) { 2437 break; 2376 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) 2438 11/19/03 10:48:45 sys/vm/vm_map.c 20 2439 case VM_INHERIT_COPY: 2504 * If we can’t accomodate max_ssize in the current mapping, no go. 2440 /* 2505 * However, we need to be aware that subsequent user mappings might 2441 * Clone the entry and link into the map. 
2506 * map into the space we have reserved for stack, and currently this 2442 */ 2507 * space is not protected. 2443 new_entry = vm_map_entry_create(new_map); 2508 * 2444 *new_entry = *old_entry; 2509 * Hopefully we will at least detect this condition when we try to 2445 new_entry->eflags &= ˜MAP_ENTRY_USER_WIRED; 2510 * grow the stack. 2446 new_entry->wired_count = 0; 2511 */ 2447 new_entry->object.vm_object = NULL; 2512 if ((prev_entry->next != &map->header) && 2448 vm_map_entry_link(new_map, new_map->header.prev, 2513 (prev_entry->next->start < addrbos + max_ssize)) { 2449 new_entry); 2514 vm_map_unlock(map); 2450 vm_map_copy_entry(old_map, new_map, old_entry, 2515 return (KERN_NO_SPACE); 2451 new_entry); 2516 } 2452 break; 2517 2453 } 2518 /* 2454 old_entry = old_entry->next; 2519 * We initially map a stack of only init_ssize. We will grow as 2455 } 2520 * needed later. Depending on the orientation of the stack (i.e. 2456 2521 * the grow direction) we either map at the top of the range, the 2457 new_map->size = old_map->size; 2522 * bottom of the range or in the middle. 2458 old_map->infork = 0; 2523 * 2459 vm_map_unlock(old_map); 2524 * Note: we would normally expect prot and max to be VM_PROT_ALL, 2460 2525 * and cow to be 0. Possibly we should eliminate these as input 2461 return (vm2); 2526 * parameters, and just pass these values here in the insert call. 2462 } 2527 */ 2463 2528 if (orient == MAP_STACK_GROWS_DOWN) 2464 int 2529 bot = addrbos + max_ssize - init_ssize; 2465 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, 2530 else if (orient == MAP_STACK_GROWS_UP) 2466 vm_prot_t prot, vm_prot_t max, int cow) 2531 bot = addrbos; 2467 { 2532 else 2468 vm_map_entry_t new_entry, prev_entry; 2533 bot = round_page(addrbos + max_ssize/2 - init_ssize/2); 2469 vm_offset_t bot, top; 2534 top = bot + init_ssize; 2470 vm_size_t init_ssize; 2535 rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); 2471 int orient, rv; 2536 2472 2537 /* Now set the avail_ssize amount. */ 2473 /* 2538 if (rv == KERN_SUCCESS) { 2474 * The stack orientation is piggybacked with the cow argument. 2539 if (prev_entry != &map->header) 2475 * Extract it into orient and mask the cow argument so that we 2540 vm_map_clip_end(map, prev_entry, bot); 2476 * don’t pass it around further. 2541 new_entry = prev_entry->next; 2477 * NOTE: We explicitly allow bi-directional stacks. 2542 if (new_entry->end != top || new_entry->start != bot) 2478 */ 2543 panic("Bad entry start/end for new stack entry"); 2479 orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP); 2544 2480 cow &= ˜orient; 2545 new_entry->avail_ssize = max_ssize - init_ssize; 2481 KASSERT(orient != 0, ("No stack grow direction")); 2546 if (orient & MAP_STACK_GROWS_DOWN) 2482 2547 new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; 2483 if (addrbos < vm_map_min(map) || addrbos > map->max_offset) 2548 if (orient & MAP_STACK_GROWS_UP) 2484 return (KERN_NO_SPACE); 2549 new_entry->eflags |= MAP_ENTRY_GROWS_UP; 2485 2550 } 2486 init_ssize = (max_ssize < sgrowsiz) ? max_ssize : sgrowsiz; 2551 2487 2552 vm_map_unlock(map); 2488 vm_map_lock(map); 2553 return (rv); 2489 2554 } 2490 /* If addr is already mapped, no go */ 2555 2491 if (vm_map_lookup_entry(map, addrbos, &prev_entry)) { 2556 /* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the 2492 vm_map_unlock(map); 2557 * desired address is already mapped, or if we successfully grow 2493 return (KERN_NO_SPACE); 2558 * the stack. 
Also returns KERN_SUCCESS if addr is outside the 2494 } 2559 * stack range (this is strange, but preserves compatibility with 2495 2560 * the grow function in vm_machdep.c). 2496 /* If we would blow our VMEM resource limit, no go */ 2561 */ 2497 if (map->size + init_ssize > 2562 int 2498 curthread->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) { 2563 vm_map_growstack(struct proc *p, vm_offset_t addr) 2499 vm_map_unlock(map); 2564 { 2500 return (KERN_NO_SPACE); 2565 vm_map_entry_t next_entry, prev_entry; 2501 } 2566 vm_map_entry_t new_entry, stack_entry; 2502 2567 struct vmspace *vm = p->p_vmspace; 2503 /* 2568 vm_map_t map = &vm->vm_map; 11/19/03 10:48:45 sys/vm/vm_map.c 21 2569 vm_offset_t end; 2633 if (grow_amount > stack_entry->avail_ssize) { 2570 size_t grow_amount, max_grow; 2634 vm_map_unlock_read(map); 2571 int is_procstack, rv; 2635 return (KERN_NO_SPACE); 2572 2636 } 2573 GIANT_REQUIRED; 2637 2574 2638 /* 2575 Retry: 2639 * If there is no longer enough space between the entries nogo, and 2576 vm_map_lock_read(map); 2640 * adjust the available space. Note: this should only happen if the 2577 2641 * user has mapped into the stack area after the stack was created, 2578 /* If addr is already in the entry range, no need to grow.*/ 2642 * and is probably an error. 2579 if (vm_map_lookup_entry(map, addr, &prev_entry)) { 2643 * 2580 vm_map_unlock_read(map); 2644 * This also effectively destroys any guard page the user might have 2581 return (KERN_SUCCESS); 2645 * intended by limiting the stack size. 2582 } 2646 */ 2583 2647 if (grow_amount > max_grow) { 2584 next_entry = prev_entry->next; 2648 if (vm_map_lock_upgrade(map)) 2585 if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) { 2649 goto Retry; 2586 /* 2650 2587 * This entry does not grow upwards. Since the address lies 2651 stack_entry->avail_ssize = max_grow; 2588 * beyond this entry, the next entry (if one exists) has to 2652 2589 * be a downward growable entry. The entry list header is 2653 vm_map_unlock(map); 2590 * never a growable entry, so it suffices to check the flags. 2654 return (KERN_NO_SPACE); 2591 */ 2655 } 2592 if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) { 2656 2593 vm_map_unlock_read(map); 2657 is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0; 2594 return (KERN_SUCCESS); 2658 2595 } 2659 /* 2596 stack_entry = next_entry; 2660 * If this is the main process stack, see if we’re over the stack 2597 } else { 2661 * limit. 2598 /* 2662 */ 2599 * This entry grows upward. If the next entry does not at 2663 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > 2600 * least grow downwards, this is the entry we need to grow. 2664 p->p_rlimit[RLIMIT_STACK].rlim_cur)) { 2601 * otherwise we have two possible choices and we have to 2665 vm_map_unlock_read(map); 2602 * select one. 2666 return (KERN_NO_SPACE); 2603 */ 2667 } 2604 if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) { 2668 2605 /* 2669 /* Round up the grow amount modulo SGROWSIZ */ 2606 * We have two choices; grow the entry closest to 2670 grow_amount = roundup (grow_amount, sgrowsiz); 2607 * the address to minimize the amount of growth. 
2671 if (grow_amount > stack_entry->avail_ssize) 2608 */ 2672 grow_amount = stack_entry->avail_ssize; 2609 if (addr - prev_entry->end <= next_entry->start - addr 2673 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > ) 2674 p->p_rlimit[RLIMIT_STACK].rlim_cur)) { 2610 stack_entry = prev_entry; 2675 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - 2611 else 2676 ctob(vm->vm_ssize); 2612 stack_entry = next_entry; 2677 } 2613 } else 2678 2614 stack_entry = prev_entry; 2679 /* If we would blow our VMEM resource limit, no go */ 2615 } 2680 if (map->size + grow_amount > 2616 2681 curthread->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) { 2617 if (stack_entry == next_entry) { 2682 vm_map_unlock_read(map); 2618 KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo")); 2683 return (KERN_NO_SPACE); 2619 KASSERT(addr < stack_entry->start, ("foo")); 2684 } 2620 end = (prev_entry != &map->header) ? prev_entry->end : 2685 2621 stack_entry->start - stack_entry->avail_ssize; 2686 if (vm_map_lock_upgrade(map)) 2622 grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE); 2687 goto Retry; 2623 max_grow = stack_entry->start - end; 2688 2624 } else { 2689 if (stack_entry == next_entry) { 2625 KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo")); 2690 /* 2626 KASSERT(addr >= stack_entry->end, ("foo")); 2691 * Growing downward. 2627 end = (next_entry != &map->header) ? next_entry->start : 2692 */ 2628 stack_entry->end + stack_entry->avail_ssize; 2693 /* Get the preliminary new entry start value */ 2629 grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE); 2694 addr = stack_entry->start - grow_amount; 2630 max_grow = end - stack_entry->end; 2695 2631 } 2696 /* 2632 2697 * If this puts us into the previous entry, cut back our 11/19/03 10:48:45 sys/vm/vm_map.c 22 2698 * growth to the available space. Also, see the note above. 2762 /* 2699 */ 2763 * Heed the MAP_WIREFUTURE flag if it was set for this process. 2700 if (addr < end) { 2764 */ 2701 stack_entry->avail_ssize = max_grow; 2765 if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) { 2702 addr = end; 2766 vm_map_wire(map, 2703 } 2767 (stack_entry == next_entry) ? addr : addr - grow_amount, 2704 2768 (stack_entry == next_entry) ? stack_entry->start : addr, 2705 rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start, 2769 (p->p_flag & P_SYSTEM) 2706 p->p_sysent->sv_stackprot, VM_PROT_ALL, 0); 2770 ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES 2707 2771 : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); 2708 /* Adjust the available stack space by the amount we grew. */ 2772 } 2709 if (rv == KERN_SUCCESS) { 2773 2710 if (prev_entry != &map->header) 2774 return (rv); 2711 vm_map_clip_end(map, prev_entry, addr); 2775 } 2712 new_entry = prev_entry->next; 2776 2713 KASSERT(new_entry == stack_entry->prev, ("foo")); 2777 /* 2714 KASSERT(new_entry->end == stack_entry->start, ("foo")) 2778 * Unshare the specified VM space for exec. If other processes are ; 2779 * mapped to it, then create a new one. The new vmspace is null. 
2715 KASSERT(new_entry->start == addr, ("foo")); 2780 */ 2716 grow_amount = new_entry->end - new_entry->start; 2781 void 2717 new_entry->avail_ssize = stack_entry->avail_ssize - 2782 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) 2718 grow_amount; 2783 { 2719 stack_entry->eflags &= ˜MAP_ENTRY_GROWS_DOWN; 2784 struct vmspace *oldvmspace = p->p_vmspace; 2720 new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; 2785 struct vmspace *newvmspace; 2721 } 2786 2722 } else { 2787 GIANT_REQUIRED; 2723 /* 2788 newvmspace = vmspace_alloc(minuser, maxuser); 2724 * Growing upward. 2789 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy, 2725 */ 2790 (caddr_t) (newvmspace + 1) - (caddr_t) &newvmspace->vm_startcopy); 2726 addr = stack_entry->end + grow_amount; 2791 /* 2727 2792 * This code is written like this for prototype purposes. The 2728 /* 2793 * goal is to avoid running down the vmspace here, but let the 2729 * If this puts us into the next entry, cut back our growth 2794 * other process’s that are still using the vmspace to finally 2730 * to the available space. Also, see the note above. 2795 * run it down. Even though there is little or no chance of blocking 2731 */ 2796 * here, it is a good idea to keep this form for future mods. 2732 if (addr > end) { 2797 */ 2733 stack_entry->avail_ssize = end - stack_entry->end; 2798 p->p_vmspace = newvmspace; 2734 addr = end; 2799 pmap_pinit2(vmspace_pmap(newvmspace)); 2735 } 2800 vmspace_free(oldvmspace); 2736 2801 if (p == curthread->td_proc) /* XXXKSE ? */ 2737 grow_amount = addr - stack_entry->end; 2802 pmap_activate(curthread); 2738 2803 } 2739 /* Grow the underlying object if applicable. */ 2804 2740 if (stack_entry->object.vm_object == NULL || 2805 /* 2741 vm_object_coalesce(stack_entry->object.vm_object, 2806 * Unshare the specified VM space for forcing COW. This 2742 OFF_TO_IDX(stack_entry->offset), 2807 * is called by rfork, for the (RFMEM|RFPROC) == 0 case. 2743 (vm_size_t)(stack_entry->end - stack_entry->start), 2808 */ 2744 (vm_size_t)grow_amount)) { 2809 void 2745 map->size += (addr - stack_entry->end); 2810 vmspace_unshare(struct proc *p) 2746 /* Update the current entry. */ 2811 { 2747 stack_entry->end = addr; 2812 struct vmspace *oldvmspace = p->p_vmspace; 2748 stack_entry->avail_ssize -= grow_amount; 2813 struct vmspace *newvmspace; 2749 rv = KERN_SUCCESS; 2814 2750 2815 GIANT_REQUIRED; 2751 if (next_entry != &map->header) 2816 if (oldvmspace->vm_refcnt == 1) 2752 vm_map_clip_start(map, next_entry, addr); 2817 return; 2753 } else 2818 newvmspace = vmspace_fork(oldvmspace); 2754 rv = KERN_FAILURE; 2819 p->p_vmspace = newvmspace; 2755 } 2820 pmap_pinit2(vmspace_pmap(newvmspace)); 2756 2821 vmspace_free(oldvmspace); 2757 if (rv == KERN_SUCCESS && is_procstack) 2822 if (p == curthread->td_proc) /* XXXKSE ? */ 2758 vm->vm_ssize += btoc(grow_amount); 2823 pmap_activate(curthread); 2759 2824 } 2760 vm_map_unlock(map); 2825 2761 2826 /* 11/19/03 10:48:45 sys/vm/vm_map.c 23 2827 * vm_map_lookup: 2892 2828 * 2893 /* 2829 * Finds the VM object, offset, and 2894 * Handle submaps. 2830 * protection for a given virtual address in the 2895 */ 2831 * specified map, assuming a page fault of the 2896 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { 2832 * type specified. 2897 vm_map_t old_map = map; 2833 * 2898 2834 * Leaves the map in question locked for read; return 2899 *var_map = map = entry->object.sub_map; 2835 * values are guaranteed until a vm_map_lookup_done 2900 vm_map_unlock_read(old_map); 2836 * call is performed. 
Note that the map argument 2901 goto RetryLookup; 2837 * is in/out; the returned map must be used in 2902 } 2838 * the call to vm_map_lookup_done. 2903 2839 * 2904 /* 2840 * A handle (out_entry) is returned for use in 2905 * Check whether this task is allowed to have this page. 2841 * vm_map_lookup_done, to make that fast. 2906 * Note the special case for MAP_ENTRY_COW 2842 * 2907 * pages with an override. This is to implement a forced 2843 * If a lookup is requested with "write protection" 2908 * COW for debuggers. 2844 * specified, the map may be changed to perform virtual 2909 */ 2845 * copying operations, although the data referenced will 2910 if (fault_type & VM_PROT_OVERRIDE_WRITE) 2846 * remain the same. 2911 prot = entry->max_protection; 2847 */ 2912 else 2848 int 2913 prot = entry->protection; 2849 vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ 2914 fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); 2850 vm_offset_t vaddr, 2915 if ((fault_type & prot) != fault_type) { 2851 vm_prot_t fault_typea, 2916 RETURN(KERN_PROTECTION_FAILURE); 2852 vm_map_entry_t *out_entry, /* OUT */ 2917 } 2853 vm_object_t *object, /* OUT */ 2918 if ((entry->eflags & MAP_ENTRY_USER_WIRED) && 2854 vm_pindex_t *pindex, /* OUT */ 2919 (entry->eflags & MAP_ENTRY_COW) && 2855 vm_prot_t *out_prot, /* OUT */ 2920 (fault_type & VM_PROT_WRITE) && 2856 boolean_t *wired) /* OUT */ 2921 (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) { 2857 { 2922 RETURN(KERN_PROTECTION_FAILURE); 2858 vm_map_entry_t entry; 2923 } 2859 vm_map_t map = *var_map; 2924 2860 vm_prot_t prot; 2925 /* 2861 vm_prot_t fault_type = fault_typea; 2926 * If this page is not pageable, we have to get it for all possible 2862 2927 * accesses. 2863 RetryLookup:; 2928 */ 2864 /* 2929 *wired = (entry->wired_count != 0); 2865 * Lookup the faulting address. 2930 if (*wired) 2866 */ 2931 prot = fault_type = entry->protection; 2867 2932 2868 vm_map_lock_read(map); 2933 /* 2869 #define RETURN(why) \ 2934 * If the entry was copy-on-write, we either ... 2870 { \ 2935 */ 2871 vm_map_unlock_read(map); \ 2936 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { 2872 return (why); \ 2937 /* 2873 } 2938 * If we want to write the page, we may as well handle that 2874 2939 * now since we’ve got the map locked. 2875 /* 2940 * 2876 * If the map has an interesting hint, try it before calling full 2941 * If we don’t need to write the page, we just demote the 2877 * blown lookup routine. 2942 * permissions allowed. 2878 */ 2943 */ 2879 entry = map->root; 2944 if (fault_type & VM_PROT_WRITE) { 2880 *out_entry = entry; 2945 /* 2881 if (entry == NULL || 2946 * Make a new object, and place it in the object 2882 (vaddr < entry->start) || (vaddr >= entry->end)) { 2947 * chain. Note that no new references have appeared 2883 /* 2948 * -- one just moved from the map to the new 2884 * Entry was either not a valid hint, or the vaddr was not 2949 * object. 2885 * contained in the entry, so do a full lookup. 2950 */ 2886 */ 2951 if (vm_map_lock_upgrade(map)) 2887 if (!vm_map_lookup_entry(map, vaddr, out_entry)) 2952 goto RetryLookup; 2888 RETURN(KERN_INVALID_ADDRESS); 2953 2889 2954 vm_object_shadow( 2890 entry = *out_entry; 2955 &entry->object.vm_object, 2891 } 2956 &entry->offset, 11/19/03 10:48:45 sys/vm/vm_map.c 24 2957 atop(entry->end - entry->start)); 3022 */ 2958 entry->eflags &= ˜MAP_ENTRY_NEEDS_COPY; 3023 DB_SHOW_COMMAND(map, vm_map_print) 2959 3024 { 2960 vm_map_lock_downgrade(map); 3025 static int nlines; 2961 } else { 3026 /* XXX convert args. 
                        /*
                         * We're attempting to read a copy-on-write page --
                         * don't allow writes.
                         */
                        prot &= ~VM_PROT_WRITE;
                }
        }

        /*
         * Create an object if necessary.
         */
        if (entry->object.vm_object == NULL &&
            !map->system_map) {
                if (vm_map_lock_upgrade(map))
                        goto RetryLookup;
                entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
                    atop(entry->end - entry->start));
                entry->offset = 0;
                vm_map_lock_downgrade(map);
        }

        /*
         * Return the object/offset from this entry.  If the entry was
         * copy-on-write or empty, it has been fixed up.
         */
        *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
        *object = entry->object.vm_object;

        /*
         * Return whether this is the only map sharing this data.
         */
        *out_prot = prot;
        return (KERN_SUCCESS);

#undef RETURN
}

/*
 *      vm_map_lookup_done:
 *
 *      Releases locks acquired by a vm_map_lookup
 *      (according to the handle returned by that lookup).
 */
void
vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
{
        /*
         * Unlock the main-level map
         */
        vm_map_unlock_read(map);
}

#include "opt_ddb.h"
#ifdef DDB
#include

#include

/*
 *      vm_map_print:   [ debug ]
 */
DB_SHOW_COMMAND(map, vm_map_print)
{
        static int nlines;
        /* XXX convert args. */
        vm_map_t map = (vm_map_t)addr;
        boolean_t full = have_addr;

        vm_map_entry_t entry;

        db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
            (void *)map,
            (void *)map->pmap, map->nentries, map->timestamp);
        nlines++;

        if (!full && db_indent)
                return;

        db_indent += 2;
        for (entry = map->header.next; entry != &map->header;
            entry = entry->next) {
                db_iprintf("map entry %p: start=%p, end=%p\n",
                    (void *)entry, (void *)entry->start, (void *)entry->end);
                nlines++;
                {
                        static char *inheritance_name[4] =
                        {"share", "copy", "none", "donate_copy"};

                        db_iprintf(" prot=%x/%x/%s",
                            entry->protection,
                            entry->max_protection,
                            inheritance_name[(int)(unsigned char)entry->inheritance]);
                        if (entry->wired_count != 0)
                                db_printf(", wired");
                }
                if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
                        db_printf(", share=%p, offset=0x%jx\n",
                            (void *)entry->object.sub_map,
                            (uintmax_t)entry->offset);
                        nlines++;
                        if ((entry->prev == &map->header) ||
                            (entry->prev->object.sub_map !=
                                entry->object.sub_map)) {
                                db_indent += 2;
                                vm_map_print((db_expr_t)(intptr_t)
                                    entry->object.sub_map,
                                    full, 0, (char *)0);
                                db_indent -= 2;
                        }
                } else {
                        db_printf(", object=%p, offset=0x%jx",
                            (void *)entry->object.vm_object,
                            (uintmax_t)entry->offset);
                        if (entry->eflags & MAP_ENTRY_COW)
                                db_printf(", copy (%s)",
                                    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
                        db_printf("\n");
                        nlines++;

                        if ((entry->prev == &map->header) ||
                            (entry->prev->object.vm_object !=
                                entry->object.vm_object)) {
                                db_indent += 2;
                                vm_object_print((db_expr_t)(intptr_t)
                                    entry->object.vm_object,
                                    full, 0, (char *)0);
                                nlines += 4;
                                db_indent -= 2;
                        }
                }
        }
        db_indent -= 2;
        if (db_indent == 0)
                nlines = 0;
}


DB_SHOW_COMMAND(procvm, procvm)
{
        struct proc *p;

        if (have_addr) {
                p = (struct proc *) addr;
        } else {
                p = curproc;
        }

        db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
            (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
            (void *)vmspace_pmap(p->p_vmspace));

        vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
}

#endif /* DDB */