Author: arty Date: Thu Mar 29 06:01:52 2012 New Revision: 56268
URL: http://svn.reactos.org/svn/reactos?rev=56268&view=rev Log: [NEWCC]
Add some prose describing this functionality.
Dedicated to timo, chongo, goto and ??=
Just formatting and comments.
Modified: trunk/reactos/ntoskrnl/cache/copysup.c trunk/reactos/ntoskrnl/cache/fssup.c trunk/reactos/ntoskrnl/cache/pinsup.c
Modified: trunk/reactos/ntoskrnl/cache/copysup.c URL: http://svn.reactos.org/svn/reactos/trunk/reactos/ntoskrnl/cache/copysup.c?re... ============================================================================== --- trunk/reactos/ntoskrnl/cache/copysup.c [iso-8859-1] (original) +++ trunk/reactos/ntoskrnl/cache/copysup.c [iso-8859-1] Thu Mar 29 06:01:52 2012 @@ -28,6 +28,16 @@
/* FUNCTIONS ******************************************************************/
+/* + +CcCopyRead can be called for a region of any size and alignment, so we must +crawl the cache space, focusing one cache stripe after another and using +RtlCopyMemory to copy the input data into the cache. In constrained memory, +pages faulted into new stripes are often taken from old stripes, causing the +old stripes to be flushed right away. In the case of many short buffered in +order writes, like the ones generated by stdio, this can be really efficient. + +*/ BOOLEAN NTAPI CcCopyRead(IN PFILE_OBJECT FileObject,
Modified: trunk/reactos/ntoskrnl/cache/fssup.c URL: http://svn.reactos.org/svn/reactos/trunk/reactos/ntoskrnl/cache/fssup.c?rev=... ============================================================================== --- trunk/reactos/ntoskrnl/cache/fssup.c [iso-8859-1] (original) +++ trunk/reactos/ntoskrnl/cache/fssup.c [iso-8859-1] Thu Mar 29 06:01:52 2012 @@ -27,6 +27,45 @@ CLIENT_ID CcUnmapThreadId, CcLazyWriteThreadId; FAST_MUTEX GlobalPageOperation;
+/* + +A note about private cache maps. + +CcInitializeCacheMap and CcUninitializeCacheMap are not meant to be paired, +although they can work that way. + +The actual operation I've gleaned from reading both jan kratchovil's writing +and real filesystems is this: + +CcInitializeCacheMap means: + +Make the indicated FILE_OBJECT have a private cache map if it doesn't already +and make it have a shared cache map if it doesn't already. + +CcUninitializeCacheMap means: + +Take away the private cache map from this FILE_OBJECT. If it's the last +private cache map corresponding to a specific shared cache map (the one that +was present in the FILE_OBJECT when it was created), then delete that too, +flusing all cached information. + +Using these simple semantics, filesystems can do all the things they actually +do: + +- Copy out the shared cache map pointer from a newly initialized file object +and store it in the fcb cache. +- Copy it back into any file object and call CcInitializeCacheMap to make +that file object be associated with the caching of all the other siblings. +- Call CcUninitializeCacheMap on a FILE_OBJECT many times, but have only the +first one count for each specific FILE_OBJECT. +- Have the actual last call to CcUninitializeCacheMap (that is, the one that +causes zero private cache maps to be associated with a shared cache map) to +delete the cache map and flush. + +So private cache map here is a light weight structure that just remembers +what shared cache map it associates with. + + */ typedef struct _NOCC_PRIVATE_CACHE_MAP { LIST_ENTRY ListEntry; @@ -98,6 +137,19 @@ Map->Callbacks.ReleaseFromLazyWrite(Map->LazyContext); }
+/* + +Cc functions are required to treat alternate streams of a file as the same +for the purpose of caching, meaning that we must be able to find the shared +cache map associated with the ``real'' stream associated with a stream file +object, if one exists. We do that by identifying a private cache map in +our gamut that has the same volume, device and fscontext as the stream file +object we're holding. It's heavy but it does work. This can probably be +improved, although there doesn't seem to be any real association between +a stream file object and a sibling file object in the file object struct +itself. + + */ // Must have CcpLock() PFILE_OBJECT CcpFindOtherStreamFileObject(PFILE_OBJECT FileObject) { @@ -141,6 +193,8 @@ PNOCC_PRIVATE_CACHE_MAP PrivateCacheMap = FileObject->PrivateCacheMap;
CcpLock(); + /* We don't have a shared cache map. First find out if we have a sibling + stream file object we can take it from. */ if (!Map && FileObject->Flags & FO_STREAM_FILE) { PFILE_OBJECT IdenticalStreamFileObject = @@ -154,6 +208,7 @@ FileObject, IdenticalStreamFileObject, Map); } } + /* We still don't have a shared cache map. We need to create one. */ if (!Map) { DPRINT("Initializing file object for (%p) %wZ\n", FileObject, &FileObject->FileName); @@ -170,6 +225,9 @@ InsertTailList(&CcpAllSharedCacheMaps, &Map->Entry); DPRINT("New Map %x\n", Map); } + /* We don't have a private cache map. Link it with the shared cache map + to serve as a held reference. When the list in the shared cache map + is empty, we know we can delete it. */ if (!PrivateCacheMap) { PrivateCacheMap = ExAllocatePool(NonPagedPool, sizeof(*PrivateCacheMap)); @@ -183,6 +241,14 @@
CcpUnlock(); } + +/* + +This function is used by NewCC's MM to determine whether any section objects +for a given file are not cache sections. If that's true, we're not allowed +to resize the file, although nothing actually prevents us from doing ;-) + + */
ULONG NTAPI @@ -210,18 +276,25 @@
ASSERT(UninitializeEvent == NULL);
+ /* It may not be strictly necessary to flush here, but we do just for + kicks. */ if (Map) CcpFlushCache(Map, NULL, 0, NULL, FALSE);
CcpLock(); + /* We have a private cache map, so we've been initialized and haven't been + * uninitialized. */ if (PrivateCacheMap) { ASSERT(!Map || Map == PrivateCacheMap->Map); ASSERT(PrivateCacheMap->FileObject == FileObject);
RemoveEntryList(&PrivateCacheMap->ListEntry); + /* That was the last private cache map. It's time to delete all + cache stripes and all aspects of caching on the file. */ if (IsListEmpty(&PrivateCacheMap->Map->PrivateCacheMaps)) { + /* Get rid of all the cache stripes. */ while (!IsListEmpty(&PrivateCacheMap->Map->AssociatedBcb)) { PNOCC_BCB Bcb = CONTAINING_RECORD(PrivateCacheMap->Map->AssociatedBcb.Flink, NOCC_BCB, ThisFileList); @@ -242,9 +315,19 @@
DPRINT("Uninit complete\n");
+ /* The return from CcUninitializeCacheMap means that 'caching was stopped'. + */ return LastMap; } +/* + +CcSetFileSizes is used to tell the cache manager that the file changed +size. In our case, we use the internal Mm method MmExtendCacheSection +to notify Mm that our section potentially changed size, which may mean +truncating off data. + + */ VOID NTAPI CcSetFileSizes(IN PFILE_OBJECT FileObject, @@ -298,6 +381,13 @@ while (TRUE); } +/* + +This could be implemented much more intelligently by mapping instances +of a CoW zero page into the affected regions. We just RtlZeroMemory +for now. + +*/ BOOLEAN NTAPI CcZeroData(IN PFILE_OBJECT FileObject,
Modified: trunk/reactos/ntoskrnl/cache/pinsup.c URL: http://svn.reactos.org/svn/reactos/trunk/reactos/ntoskrnl/cache/pinsup.c?rev... ============================================================================== --- trunk/reactos/ntoskrnl/cache/pinsup.c [iso-8859-1] (original) +++ trunk/reactos/ntoskrnl/cache/pinsup.c [iso-8859-1] Thu Mar 29 06:01:52 2012 @@ -21,6 +21,73 @@ * This helped me determine that a certain bug was not a memory overwrite. */ //#define PIN_WRITE_ONLY
+/* + +Pinsup implements the core of NewCC. + +A couple of things about this code: + +I wrote this code over the course of about 2 years, often referring to Rajeev +Nagar's Filesystem Internals, book, the msdn pages on the Cc interface, and +a few NT filesystems that are open sourced. I went to fairly great lengths to +achieve a couple of goals. + +1) To make a strictly layered facility that relies entirely on Mm to provide +maps. There were many ways in which data segments in the legacy Mm were unable +to provide what I needed; page maps were only 4 gig, and all offsets were in +ULONG, so no mapping at an offset greater than 4 gig was possible. Worse than +that, due to a convoluted set of dependencies, it would have been impossible to +support any two mappings farther apart than 4 gig, even if the above was +corrected. Along with that, the cache system's ownership of some pages was +integral to the operation of legacy Mm. All of the above problems, along with +an ambiguity about when the size of a file for mapping purposes is acquired, +and its inability to allow a file to be resized when any mappings were active +led me to rewrite data sections (and all other kinds of sections in the +original version), and use that layer to implement the Cc API without regard +to any internal, undocumented parts. + +2) To write the simplest possible code that implements the Cc interface as +documented. Again this is without regard to any information that might be +gained through reverse engineering the real Cc. All conclusions about workings +of Cc here are mine, any failures are mine, any differences to the documented +interface were introduced by me due to misreading, misunderstanding or mis +remembering while implementing the code. I also implemented some obvious, but +not actually specified behaviors of Cc, for example that each cache stripe is +represented by a distinct BCB that the user can make decisions about as an +opaque pointer. + +3) To make real filesystems work properly. + +So about how it works: + +CcCacheSections is the collection of cache sections that are currently mapped. +The cache ranges which are allocated and contain pages is larger, due to the +addition of sections containing rmaps and page references, but this array +determines the actual mapped pages on behalf of all mapped files for Cc's use. +All BCB pointers yielded to a driver are a pointer to one of these cache stripe +structures. The data structure is specified as opaque and so it contains +information convenient to NEWCC's implementation here. Free entries are +summarized in CcpBitmapBuffer, for which bits are set when the entry may be +safely evicted and redirected for use by another client. Note that the +reference count for an evictable cache section will generally be 1, since +we'll keep a reference to wait for any subsequent mapping of the same stripe. +We use CcCacheClockHand as a hint to start checking free bits at a point that +walks around the cache stripe list, so that we might evict a different stripe +every time even if all are awaiting reuse. This is a way to avoid thrashing. + +CcpBitmapBuffer is the RTL_BITMAP that allows us to quickly decide what buffer +to allocate from the mapped buffer set. + +CcDeleteEvent is an event used to wait for a cache stripe reference count to +go to 1, thus making the stripe eligible for eviction. It's used by CcpMapData +to wait for a free map when we can't fail. + +All in all, use of Mm by Cc makes this code into a simple manager that wields +sections on behalf of filesystems. As such, its code is fairly high level and +no architecture specific changes should be necessary. + +*/ + /* GLOBALS ********************************************************************/
#define TAG_MAP_SEC TAG('C', 'c', 'S', 'x') @@ -54,6 +121,14 @@ PDEVICE_OBJECT NTAPI MmGetDeviceObjectForFile(IN PFILE_OBJECT FileObject); + +/* + +Allocate an almost ordinary section object for use by the cache system. +The special internal SEC_CACHE flag is used to indicate that the section +should not count when determining whether the file can be resized. + +*/
NTSTATUS CcpAllocateSection (PFILE_OBJECT FileObject, @@ -94,6 +169,14 @@ BOOLEAN Dirty; } WORK_QUEUE_WITH_CONTEXT, *PWORK_QUEUE_WITH_CONTEXT;
+/* + +Unmap a cache stripe. Note that cache stripes aren't unmapped when their +last reference disappears. We enter this code only if cache for the file +is uninitialized in the last file object, or a cache stripe is evicted. + +*/ + VOID CcpUnmapCache(PVOID Context) { @@ -104,6 +187,20 @@ ExFreePool(WorkItem); DPRINT("Done\n"); } + +/* + +Somewhat deceptively named function which removes the last reference to a +cache stripe and completely removes it using CcUnmapCache. This may be +done either inline (if the Immediate BOOLEAN is set), or using a work item +at a later time. Whether this is called to unmap immeidately is mainly +determined by whether the caller is calling from a place in filesystem code +where a deadlock may occur if immediate flushing is required. + +It's always safe to reuse the Bcb at CcCacheSections[Start] after calling +this. + + */
/* Must have acquired the mutex */ VOID CcpDereferenceCache(ULONG Start, BOOLEAN Immediate) @@ -186,6 +283,18 @@ DPRINT("Done\n"); }
+/* + +CcpAllocateCacheSections is called by CcpMapData to obtain a cache stripe, +possibly evicting an old stripe by calling CcpDereferenceCache in order to +obtain an empty Bcb. + +This function was named plural due to a question I had at the beginning of +this endeavor about whether a map may span a 256k stripe boundary. It can't +so this function can only return the index of one Bcb. Returns INVALID_CACHE +on failure. + + */ /* Needs mutex */ ULONG CcpAllocateCacheSections (PFILE_OBJECT FileObject, @@ -198,12 +307,12 @@ DPRINT("AllocateCacheSections: FileObject %x\n", FileObject); if (!FileObject->SectionObjectPointer) - return INVALID_CACHE; + return INVALID_CACHE;
Map = (PNOCC_CACHE_MAP)FileObject->SectionObjectPointer->SharedCacheMap;
if (!Map) - return INVALID_CACHE; + return INVALID_CACHE;
DPRINT("Allocating Cache Section\n");
@@ -212,34 +321,34 @@
if (i != INVALID_CACHE) { - DPRINT("Setting up Bcb #%x\n", i); - - Bcb = &CcCacheSections[i]; + DPRINT("Setting up Bcb #%x\n", i); + + Bcb = &CcCacheSections[i]; - ASSERT(Bcb->RefCount < 2); - - if (Bcb->RefCount > 0) - { - CcpDereferenceCache(i, FALSE); - } - - ASSERT(!Bcb->RefCount); - Bcb->RefCount = 1; - - DPRINT("Bcb #%x RefCount %d\n", Bcb - CcCacheSections, Bcb->RefCount); - - if (!RtlTestBit(CcCacheBitmap, i)) - { - DPRINT1("Somebody stoeled BCB #%x\n", i); - } - ASSERT(RtlTestBit(CcCacheBitmap, i)); - - DPRINT("Allocated #%x\n", i); - ASSERT(CcCacheSections[i].RefCount); + ASSERT(Bcb->RefCount < 2); + + if (Bcb->RefCount > 0) + { + CcpDereferenceCache(i, FALSE); + } + + ASSERT(!Bcb->RefCount); + Bcb->RefCount = 1; + + DPRINT("Bcb #%x RefCount %d\n", Bcb - CcCacheSections, Bcb->RefCount); + + if (!RtlTestBit(CcCacheBitmap, i)) + { + DPRINT1("Somebody stoeled BCB #%x\n", i); + } + ASSERT(RtlTestBit(CcCacheBitmap, i)); + + DPRINT("Allocated #%x\n", i); + ASSERT(CcCacheSections[i].RefCount); } else { - DPRINT1("Failed to allocate cache segment\n"); + DPRINT1("Failed to allocate cache segment\n"); } return i; } @@ -262,6 +371,14 @@ Bcb->ExclusiveWaiter++; }
+/* + +Cache stripes have an idea of exclusive access, which would be hard to support +properly in the previous code. In our case, it's fairly easy, since we have +an event that indicates that the previous exclusive waiter has returned in each +Bcb. + +*/ /* Must not have the mutex */ VOID CcpReferenceCacheExclusive(ULONG Start) { @@ -277,7 +394,19 @@ CcpUnlock(); }
-/* Find a map that encompasses the target range */ +/* + +Find a map that encompasses the target range. This function does not check +whether the desired range is partly outside the stripe. This could be +implemented with a generic table, but we generally aren't carring around a lot +of segments at once for a particular file. + +When this returns a map for a given file address, then that address is by +definition already mapped and can be operated on. + +Returns a valid index or INVALID_CACHE. + +*/ /* Must have the mutex */ ULONG CcpFindMatchingMap(PLIST_ENTRY Head, PLARGE_INTEGER FileOffset, ULONG Length) { @@ -301,6 +430,14 @@
return INVALID_CACHE; } + +/* + +Internal function that's used by all pinning functions. +It causes a mapped region to exist and prefaults the pages in it if possible, +possibly evicting another stripe in order to get our stripe. + +*/
BOOLEAN NTAPI @@ -364,6 +501,11 @@
DPRINT("File size %08x%08x\n", Map->FileSizes.ValidDataLength.HighPart, Map->FileSizes.ValidDataLength.LowPart); + /* Not all files have length, in fact filesystems often use stream file + objects for various internal purposes and are loose about the file + length, since the filesystem promises itself to write the right number + of bytes to the internal stream. In these cases, we just allow the file + to have the full stripe worth of space. */ if (Map->FileSizes.ValidDataLength.QuadPart) { SectionSize = min(CACHE_STRIPE, Map->FileSizes.ValidDataLength.QuadPart - Target.QuadPart); @@ -378,6 +520,8 @@ //ASSERT(SectionSize <= CACHE_STRIPE);
CcpUnlock(); + /* CcpAllocateSection doesn't need the lock, so we'll give other action + a chance in here. */ Status = CcpAllocateSection (FileObject, SectionSize, @@ -399,8 +543,9 @@ retry: /* Returns a reference */ - DPRINT("Allocating cache sections: %wZ\n", &FileObject->FileName); + DPRINT("Allocating cache sections: %wZ\n", &FileObject->FileName); BcbHead = CcpAllocateCacheSections(FileObject, SectionObject); + /* XXX todo: we should handle the immediate fail case here, but don't */ if (BcbHead == INVALID_CACHE) { ULONG i; @@ -429,12 +574,18 @@ ViewSize = CACHE_STRIPE;
Bcb = &CcCacheSections[BcbHead]; + /* MmMapCacheViewInSystemSpaceAtOffset is one of three methods of Mm + that are specific to NewCC. In this case, it's implementation + exactly mirrors MmMapViewInSystemSpace, but allows an offset to + be specified. */ Status = MmMapCacheViewInSystemSpaceAtOffset (SectionObject->Segment, &Bcb->BaseAddress, &Target, &ViewSize); + /* Summary: Failure. Dereference our section and tell the user we failed + */ if (!NT_SUCCESS(Status)) { *BcbResult = NULL; @@ -447,6 +598,9 @@ goto cleanup; } + /* Summary: Success. Put together a valid Bcb and link it with the others + * in the NOCC_CACHE_MAP. + */ Success = TRUE; //DPRINT("w1n\n");
@@ -539,6 +693,8 @@ return Result; }
+/* Used by functions that repin data, CcpPinMappedData does not alter the map, + but finds the appropriate stripe and update the accounting. */ BOOLEAN NTAPI CcpPinMappedData(IN PNOCC_CACHE_MAP Map, @@ -707,6 +863,32 @@
return Result; } + +/* + +CcpUnpinData is the internal function that generally handles unpinning data. +It may be a little confusing, because of the way reference counts are handled. + +A reference count of 2 or greater means that the stripe is still fully pinned +and can't be removed. If the owner had taken an exclusive reference, then +give one up. Note that it's an error to take more than one exclusive reference +or to take a non-exclusive reference after an exclusive reference, so detecting +or handling that case is not considered. + +ReleaseBit is unset if we want to detect when a cache stripe would become +evictable without actually giving up our reference. We might want to do that +if we were going to flush before formally releasing the cache stripe, although +that facility is not used meaningfully at this time. + +A reference count of exactly 1 means that the stripe could potentially be +reused, but could also be evicted for another mapping. In general, most +stripes should be in that state most of the time. + +A reference count of zero means that the Bcb is completely unused. That's the +start state and the state of a Bcb formerly owned by a file that is +uninitialized. + +*/
BOOLEAN NTAPI