Author: arty
Date: Thu Mar 29 06:01:52 2012
New Revision: 56268
URL:
http://svn.reactos.org/svn/reactos?rev=56268&view=rev
Log:
[NEWCC]
Add some prose describing this functionality.
Dedicated to timo, chongo, goto and ??=
Just formatting and comments.
Modified:
trunk/reactos/ntoskrnl/cache/copysup.c
trunk/reactos/ntoskrnl/cache/fssup.c
trunk/reactos/ntoskrnl/cache/pinsup.c
Modified: trunk/reactos/ntoskrnl/cache/copysup.c
URL:
http://svn.reactos.org/svn/reactos/trunk/reactos/ntoskrnl/cache/copysup.c?r…
==============================================================================
--- trunk/reactos/ntoskrnl/cache/copysup.c [iso-8859-1] (original)
+++ trunk/reactos/ntoskrnl/cache/copysup.c [iso-8859-1] Thu Mar 29 06:01:52 2012
@@ -28,6 +28,16 @@
/* FUNCTIONS ******************************************************************/
+/*
+
+CcCopyRead can be called for a region of any size and alignment, so we must
+crawl the cache space, focusing one cache stripe after another and using
+RtlCopyMemory to copy the input data into the cache. In constrained memory,
+pages faulted into new stripes are often taken from old stripes, causing the
+old stripes to be flushed right away. In the case of many short, buffered,
+in-order writes, like the ones generated by stdio, this can be really efficient.
+
+*/
BOOLEAN
NTAPI
CcCopyRead(IN PFILE_OBJECT FileObject,
Modified: trunk/reactos/ntoskrnl/cache/fssup.c
URL:
http://svn.reactos.org/svn/reactos/trunk/reactos/ntoskrnl/cache/fssup.c?rev…
==============================================================================
--- trunk/reactos/ntoskrnl/cache/fssup.c [iso-8859-1] (original)
+++ trunk/reactos/ntoskrnl/cache/fssup.c [iso-8859-1] Thu Mar 29 06:01:52 2012
@@ -27,6 +27,45 @@
CLIENT_ID CcUnmapThreadId, CcLazyWriteThreadId;
FAST_MUTEX GlobalPageOperation;
+/*
+
+A note about private cache maps.
+
+CcInitializeCacheMap and CcUninitializeCacheMap are not meant to be paired,
+although they can work that way.
+
+The actual operation I've gleaned from reading both Jan Kratochvil's writing
+and real filesystems is this:
+
+CcInitializeCacheMap means:
+
+Make the indicated FILE_OBJECT have a private cache map if it doesn't already
+and make it have a shared cache map if it doesn't already.
+
+CcUninitializeCacheMap means:
+
+Take away the private cache map from this FILE_OBJECT. If it's the last
+private cache map corresponding to a specific shared cache map (the one that
+was present in the FILE_OBJECT when it was created), then delete that too,
+flushing all cached information.
+
+Using these simple semantics, filesystems can do all the things they actually
+do:
+
+- Copy out the shared cache map pointer from a newly initialized file object
+and store it in the fcb cache.
+- Copy it back into any file object and call CcInitializeCacheMap to make
+that file object be associated with the caching of all the other siblings.
+- Call CcUninitializeCacheMap on a FILE_OBJECT many times, but have only the
+first one count for each specific FILE_OBJECT.
+- Have the actual last call to CcUninitializeCacheMap (that is, the one that
+causes zero private cache maps to be associated with a shared cache map)
+delete the cache map and flush.
+
+So a private cache map here is a lightweight structure that just remembers
+what shared cache map it associates with.
+
+ */
typedef struct _NOCC_PRIVATE_CACHE_MAP
{
LIST_ENTRY ListEntry;
@@ -98,6 +137,19 @@
Map->Callbacks.ReleaseFromLazyWrite(Map->LazyContext);
}
+/*
+
+Cc functions are required to treat alternate streams of a file as the same
+for the purpose of caching, meaning that we must be able to find the shared
+cache map associated with the ``real'' stream associated with a stream file
+object, if one exists. We do that by identifying a private cache map in
+our gamut that has the same volume, device and fscontext as the stream file
+object we're holding. It's heavy but it does work. This can probably be
+improved, although there doesn't seem to be any real association between
+a stream file object and a sibling file object in the file object struct
+itself.
+
+ */
// Must have CcpLock()
PFILE_OBJECT CcpFindOtherStreamFileObject(PFILE_OBJECT FileObject)
{
@@ -141,6 +193,8 @@
PNOCC_PRIVATE_CACHE_MAP PrivateCacheMap = FileObject->PrivateCacheMap;
CcpLock();
+ /* We don't have a shared cache map. First find out if we have a sibling
+ stream file object we can take it from. */
if (!Map && FileObject->Flags & FO_STREAM_FILE)
{
PFILE_OBJECT IdenticalStreamFileObject =
@@ -154,6 +208,7 @@
FileObject, IdenticalStreamFileObject, Map);
}
}
+ /* We still don't have a shared cache map. We need to create one. */
if (!Map)
{
DPRINT("Initializing file object for (%p) %wZ\n", FileObject,
&FileObject->FileName);
@@ -170,6 +225,9 @@
InsertTailList(&CcpAllSharedCacheMaps, &Map->Entry);
DPRINT("New Map %x\n", Map);
}
+ /* We don't have a private cache map. Link it with the shared cache map
+ to serve as a held reference. When the list in the shared cache map
+ is empty, we know we can delete it. */
if (!PrivateCacheMap)
{
PrivateCacheMap = ExAllocatePool(NonPagedPool, sizeof(*PrivateCacheMap));
@@ -183,6 +241,14 @@
CcpUnlock();
}
+
+/*
+
+This function is used by NewCC's MM to determine whether any section objects
+for a given file are not cache sections. If that's true, we're not allowed
+to resize the file, although nothing actually prevents us from doing so ;-)
+
+ */
ULONG
NTAPI
@@ -210,18 +276,25 @@
ASSERT(UninitializeEvent == NULL);
+ /* It may not be strictly necessary to flush here, but we do just for
+ kicks. */
if (Map)
CcpFlushCache(Map, NULL, 0, NULL, FALSE);
CcpLock();
+ /* We have a private cache map, so we've been initialized and haven't been
+ * uninitialized. */
if (PrivateCacheMap)
{
ASSERT(!Map || Map == PrivateCacheMap->Map);
ASSERT(PrivateCacheMap->FileObject == FileObject);
RemoveEntryList(&PrivateCacheMap->ListEntry);
+ /* That was the last private cache map. It's time to delete all
+ cache stripes and all aspects of caching on the file. */
if (IsListEmpty(&PrivateCacheMap->Map->PrivateCacheMaps))
{
+ /* Get rid of all the cache stripes. */
while (!IsListEmpty(&PrivateCacheMap->Map->AssociatedBcb))
{
PNOCC_BCB Bcb = CONTAINING_RECORD(PrivateCacheMap->Map->AssociatedBcb.Flink,
NOCC_BCB, ThisFileList);
@@ -242,9 +315,19 @@
DPRINT("Uninit complete\n");
+ /* The return from CcUninitializeCacheMap means that 'caching was stopped'.
+ */
return LastMap;
}
+/*
+
+CcSetFileSizes is used to tell the cache manager that the file changed
+size. In our case, we use the internal Mm method MmExtendCacheSection
+to notify Mm that our section potentially changed size, which may mean
+truncating off data.
+
+ */
VOID
NTAPI
CcSetFileSizes(IN PFILE_OBJECT FileObject,
@@ -298,6 +381,13 @@
while (TRUE);
}
+/*
+
+This could be implemented much more intelligently by mapping instances
+of a CoW zero page into the affected regions. We just RtlZeroMemory
+for now.
+
+*/
BOOLEAN
NTAPI
CcZeroData(IN PFILE_OBJECT FileObject,
Modified: trunk/reactos/ntoskrnl/cache/pinsup.c
URL:
http://svn.reactos.org/svn/reactos/trunk/reactos/ntoskrnl/cache/pinsup.c?re…
==============================================================================
--- trunk/reactos/ntoskrnl/cache/pinsup.c [iso-8859-1] (original)
+++ trunk/reactos/ntoskrnl/cache/pinsup.c [iso-8859-1] Thu Mar 29 06:01:52 2012
@@ -21,6 +21,73 @@
* This helped me determine that a certain bug was not a memory overwrite. */
//#define PIN_WRITE_ONLY
+/*
+
+Pinsup implements the core of NewCC.
+
+A couple of things about this code:
+
+I wrote this code over the course of about 2 years, often referring to Rajeev
+Nagar's Filesystem Internals book, the MSDN pages on the Cc interface, and
+a few NT filesystems that are open sourced. I went to fairly great lengths to
+achieve a couple of goals.
+
+1) To make a strictly layered facility that relies entirely on Mm to provide
+maps. There were many ways in which data segments in the legacy Mm were unable
+to provide what I needed; page maps were only 4 gig, and all offsets were in
+ULONG, so no mapping at an offset greater than 4 gig was possible. Worse than
+that, due to a convoluted set of dependencies, it would have been impossible to
+support any two mappings farther apart than 4 gig, even if the above was
+corrected. Along with that, the cache system's ownership of some pages was
+integral to the operation of legacy Mm. All of the above problems, along with
+an ambiguity about when the size of a file for mapping purposes is acquired,
+and its inability to allow a file to be resized when any mappings were active
+led me to rewrite data sections (and all other kinds of sections in the
+original version), and use that layer to implement the Cc API without regard
+to any internal, undocumented parts.
+
+2) To write the simplest possible code that implements the Cc interface as
+documented. Again this is without regard to any information that might be
+gained through reverse engineering the real Cc. All conclusions about workings
+of Cc here are mine, any failures are mine, any differences to the documented
+interface were introduced by me due to misreading, misunderstanding or
+misremembering while implementing the code. I also implemented some obvious, but
+not actually specified behaviors of Cc, for example that each cache stripe is
+represented by a distinct BCB that the user can make decisions about as an
+opaque pointer.
+
+3) To make real filesystems work properly.
+
+So about how it works:
+
+CcCacheSections is the collection of cache sections that are currently mapped.
+The set of cache ranges that are allocated and contain pages is larger, due to
+the addition of sections containing rmaps and page references, but this array
+determines the actual mapped pages on behalf of all mapped files for Cc's use.
+All BCB pointers yielded to a driver are a pointer to one of these cache stripe
+structures. The data structure is specified as opaque and so it contains
+information convenient to NEWCC's implementation here. Free entries are
+summarized in CcpBitmapBuffer, for which bits are set when the entry may be
+safely evicted and redirected for use by another client. Note that the
+reference count for an evictable cache section will generally be 1, since
+we'll keep a reference to wait for any subsequent mapping of the same stripe.
+We use CcCacheClockHand as a hint to start checking free bits at a point that
+walks around the cache stripe list, so that we might evict a different stripe
+every time even if all are awaiting reuse. This is a way to avoid thrashing.
+
+CcpBitmapBuffer is the RTL_BITMAP that allows us to quickly decide what buffer
+to allocate from the mapped buffer set.
+
+CcDeleteEvent is an event used to wait for a cache stripe reference count to
+go to 1, thus making the stripe eligible for eviction. It's used by CcpMapData
+to wait for a free map when we can't fail.
+
+All in all, use of Mm by Cc makes this code into a simple manager that wields
+sections on behalf of filesystems. As such, its code is fairly high level and
+no architecture specific changes should be necessary.
+
+*/
+
/* GLOBALS ********************************************************************/
#define TAG_MAP_SEC TAG('C', 'c', 'S', 'x')
@@ -54,6 +121,14 @@
PDEVICE_OBJECT
NTAPI
MmGetDeviceObjectForFile(IN PFILE_OBJECT FileObject);
+
+/*
+
+Allocate an almost ordinary section object for use by the cache system.
+The special internal SEC_CACHE flag is used to indicate that the section
+should not count when determining whether the file can be resized.
+
+*/
NTSTATUS CcpAllocateSection
(PFILE_OBJECT FileObject,
@@ -94,6 +169,14 @@
BOOLEAN Dirty;
} WORK_QUEUE_WITH_CONTEXT, *PWORK_QUEUE_WITH_CONTEXT;
+/*
+
+Unmap a cache stripe. Note that cache stripes aren't unmapped when their
+last reference disappears. We enter this code only if cache for the file
+is uninitialized in the last file object, or a cache stripe is evicted.
+
+*/
+
VOID
CcpUnmapCache(PVOID Context)
{
@@ -104,6 +187,20 @@
ExFreePool(WorkItem);
DPRINT("Done\n");
}
+
+/*
+
+Somewhat deceptively named function which removes the last reference to a
+cache stripe and completely removes it using CcpUnmapCache. This may be
+done either inline (if the Immediate BOOLEAN is set), or using a work item
+at a later time. Whether this is called to unmap immediately is mainly
+determined by whether the caller is calling from a place in filesystem code
+where a deadlock may occur if immediate flushing is required.
+
+It's always safe to reuse the Bcb at CcCacheSections[Start] after calling
+this.
+
+ */
/* Must have acquired the mutex */
VOID CcpDereferenceCache(ULONG Start, BOOLEAN Immediate)
@@ -186,6 +283,18 @@
DPRINT("Done\n");
}
+/*
+
+CcpAllocateCacheSections is called by CcpMapData to obtain a cache stripe,
+possibly evicting an old stripe by calling CcpDereferenceCache in order to
+obtain an empty Bcb.
+
+This function was named plural due to a question I had at the beginning of
+this endeavor about whether a map may span a 256k stripe boundary. It can't
+so this function can only return the index of one Bcb. Returns INVALID_CACHE
+on failure.
+
+ */
/* Needs mutex */
ULONG CcpAllocateCacheSections
(PFILE_OBJECT FileObject,
@@ -198,12 +307,12 @@
DPRINT("AllocateCacheSections: FileObject %x\n", FileObject);
if (!FileObject->SectionObjectPointer)
- return INVALID_CACHE;
+ return INVALID_CACHE;
Map = (PNOCC_CACHE_MAP)FileObject->SectionObjectPointer->SharedCacheMap;
if (!Map)
- return INVALID_CACHE;
+ return INVALID_CACHE;
DPRINT("Allocating Cache Section\n");
@@ -212,34 +321,34 @@
if (i != INVALID_CACHE)
{
- DPRINT("Setting up Bcb #%x\n", i);
-
- Bcb = &CcCacheSections[i];
+ DPRINT("Setting up Bcb #%x\n", i);
+
+ Bcb = &CcCacheSections[i];
- ASSERT(Bcb->RefCount < 2);
-
- if (Bcb->RefCount > 0)
- {
- CcpDereferenceCache(i, FALSE);
- }
-
- ASSERT(!Bcb->RefCount);
- Bcb->RefCount = 1;
-
- DPRINT("Bcb #%x RefCount %d\n", Bcb - CcCacheSections, Bcb->RefCount);
-
- if (!RtlTestBit(CcCacheBitmap, i))
- {
- DPRINT1("Somebody stoeled BCB #%x\n", i);
- }
- ASSERT(RtlTestBit(CcCacheBitmap, i));
-
- DPRINT("Allocated #%x\n", i);
- ASSERT(CcCacheSections[i].RefCount);
+ ASSERT(Bcb->RefCount < 2);
+
+ if (Bcb->RefCount > 0)
+ {
+ CcpDereferenceCache(i, FALSE);
+ }
+
+ ASSERT(!Bcb->RefCount);
+ Bcb->RefCount = 1;
+
+ DPRINT("Bcb #%x RefCount %d\n", Bcb - CcCacheSections,
Bcb->RefCount);
+
+ if (!RtlTestBit(CcCacheBitmap, i))
+ {
+ DPRINT1("Somebody stoeled BCB #%x\n", i);
+ }
+ ASSERT(RtlTestBit(CcCacheBitmap, i));
+
+ DPRINT("Allocated #%x\n", i);
+ ASSERT(CcCacheSections[i].RefCount);
}
else
{
- DPRINT1("Failed to allocate cache segment\n");
+ DPRINT1("Failed to allocate cache segment\n");
}
return i;
}
@@ -262,6 +371,14 @@
Bcb->ExclusiveWaiter++;
}
+/*
+
+Cache stripes have an idea of exclusive access, which would be hard to support
+properly in the previous code. In our case, it's fairly easy, since we have
+an event that indicates that the previous exclusive waiter has returned in each
+Bcb.
+
+*/
/* Must not have the mutex */
VOID CcpReferenceCacheExclusive(ULONG Start)
{
@@ -277,7 +394,19 @@
CcpUnlock();
}
-/* Find a map that encompasses the target range */
+/*
+
+Find a map that encompasses the target range. This function does not check
+whether the desired range is partly outside the stripe. This could be
+implemented with a generic table, but we generally aren't carrying around a lot
+of segments at once for a particular file.
+
+When this returns a map for a given file address, then that address is by
+definition already mapped and can be operated on.
+
+Returns a valid index or INVALID_CACHE.
+
+*/
/* Must have the mutex */
ULONG CcpFindMatchingMap(PLIST_ENTRY Head, PLARGE_INTEGER FileOffset, ULONG Length)
{
@@ -301,6 +430,14 @@
return INVALID_CACHE;
}
+
+/*
+
+Internal function that's used by all pinning functions.
+It causes a mapped region to exist and prefaults the pages in it if possible,
+possibly evicting another stripe in order to get our stripe.
+
+*/
BOOLEAN
NTAPI
@@ -364,6 +501,11 @@
DPRINT("File size %08x%08x\n", Map->FileSizes.ValidDataLength.HighPart,
Map->FileSizes.ValidDataLength.LowPart);
+ /* Not all files have length, in fact filesystems often use stream file
+ objects for various internal purposes and are loose about the file
+ length, since the filesystem promises itself to write the right number
+ of bytes to the internal stream. In these cases, we just allow the file
+ to have the full stripe worth of space. */
if (Map->FileSizes.ValidDataLength.QuadPart)
{
SectionSize = min(CACHE_STRIPE, Map->FileSizes.ValidDataLength.QuadPart -
Target.QuadPart);
@@ -378,6 +520,8 @@
//ASSERT(SectionSize <= CACHE_STRIPE);
CcpUnlock();
+ /* CcpAllocateSection doesn't need the lock, so we'll give other action
+ a chance in here. */
Status = CcpAllocateSection
(FileObject,
SectionSize,
@@ -399,8 +543,9 @@
retry:
/* Returns a reference */
- DPRINT("Allocating cache sections: %wZ\n", &FileObject->FileName);
+ DPRINT("Allocating cache sections: %wZ\n", &FileObject->FileName);
BcbHead = CcpAllocateCacheSections(FileObject, SectionObject);
+ /* XXX todo: we should handle the immediate fail case here, but don't */
if (BcbHead == INVALID_CACHE)
{
ULONG i;
@@ -429,12 +574,18 @@
ViewSize = CACHE_STRIPE;
Bcb = &CcCacheSections[BcbHead];
+ /* MmMapCacheViewInSystemSpaceAtOffset is one of three methods of Mm
+ that are specific to NewCC. In this case, its implementation
+ exactly mirrors MmMapViewInSystemSpace, but allows an offset to
+ be specified. */
Status = MmMapCacheViewInSystemSpaceAtOffset
(SectionObject->Segment,
&Bcb->BaseAddress,
&Target,
&ViewSize);
+ /* Summary: Failure. Dereference our section and tell the user we failed
+ */
if (!NT_SUCCESS(Status))
{
*BcbResult = NULL;
@@ -447,6 +598,9 @@
goto cleanup;
}
+ /* Summary: Success. Put together a valid Bcb and link it with the others
+ * in the NOCC_CACHE_MAP.
+ */
Success = TRUE;
//DPRINT("w1n\n");
@@ -539,6 +693,8 @@
return Result;
}
+/* Used by functions that repin data. CcpPinMappedData does not alter the map,
+ but finds the appropriate stripe and updates the accounting. */
BOOLEAN
NTAPI
CcpPinMappedData(IN PNOCC_CACHE_MAP Map,
@@ -707,6 +863,32 @@
return Result;
}
+
+/*
+
+CcpUnpinData is the internal function that generally handles unpinning data.
+It may be a little confusing, because of the way reference counts are handled.
+
+A reference count of 2 or greater means that the stripe is still fully pinned
+and can't be removed. If the owner had taken an exclusive reference, then
+give one up. Note that it's an error to take more than one exclusive reference
+or to take a non-exclusive reference after an exclusive reference, so detecting
+or handling that case is not considered.
+
+ReleaseBit is unset if we want to detect when a cache stripe would become
+evictable without actually giving up our reference. We might want to do that
+if we were going to flush before formally releasing the cache stripe, although
+that facility is not used meaningfully at this time.
+
+A reference count of exactly 1 means that the stripe could potentially be
+reused, but could also be evicted for another mapping. In general, most
+stripes should be in that state most of the time.
+
+A reference count of zero means that the Bcb is completely unused. That's the
+start state and the state of a Bcb formerly owned by a file that is
+uninitialized.
+
+*/
BOOLEAN
NTAPI