H5ACpkg.h

00001 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
00002  * Copyright by The HDF Group.                                               *
00003  * Copyright by the Board of Trustees of the University of Illinois.         *
00004  * All rights reserved.                                                      *
00005  *                                                                           *
00006  * This file is part of HDF5.  The full HDF5 copyright notice, including     *
00007  * terms governing use, modification, and redistribution, is contained in    *
00008  * the files COPYING and Copyright.html.  COPYING can be found at the root   *
00009  * of the source code distribution tree; Copyright.html can be found at the  *
00010  * root level of an installed copy of the electronic HDF5 document set and   *
00011  * is linked from the top-level documents page.  It can also be found at     *
00012  * http://hdfgroup.org/HDF5/doc/Copyright.html.  If you do not have          *
00013  * access to either file, you may request a copy from help@hdfgroup.org.     *
00014  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00015 
00016 /*
00017  * Programmer: John Mainzer -- 4/19/06
00018  *
00019  * Purpose:     This file contains declarations which are normally visible
00020  *              only within the H5AC package (just H5AC.c at present).
00021  *
00022  *              Source files outside the H5AC package should include
00023  *              H5ACprivate.h instead.
00024  *
00025  *              The one exception to this rule is testpar/t_cache.c.  The
00026  *              test code is easier to write if it can look at H5AC_aux_t.
00027  *              Indeed, this is the main reason why this file was created.
00028  */
00029 
00030 #ifndef H5AC_PACKAGE
00031 #error "Do not include this file outside the H5AC package!"
00032 #endif
00033 
00034 #ifndef _H5ACpkg_H
00035 #define _H5ACpkg_H
00036 
00037 /* Get package's private header */
00038 #include "H5ACprivate.h"
00039 
00040 
00041 /* Get needed headers */
00042 #include "H5Cprivate.h"         /* Cache                                */
00043 #include "H5SLprivate.h"        /* Skip lists */
00044 
00045 
/* Compile-time switch: when set to a non-zero value, the dirty-byte
 * debugging counters are compiled into H5AC_aux_t.
 */
#define H5AC_DEBUG_DIRTY_BYTES_CREATION 0

/*-------------------------------------------------------------------------
 *  It is a bit difficult to set ranges of allowable values on the
 *  dirty_bytes_threshold field of H5AC_aux_t.  The following are
 *  probably broader than they should be.
 *
 *  Each replacement list is wrapped in an outer pair of parentheses so
 *  that the (int32_t) cast stays attached to its operand no matter what
 *  expression the macro is expanded in (e.g. as the operand of sizeof).
 *-------------------------------------------------------------------------
 */

/* Lower bound: half of H5C__MIN_MAX_CACHE_SIZE. */
#define H5AC__MIN_DIRTY_BYTES_THRESHOLD         \
    ((int32_t)(H5C__MIN_MAX_CACHE_SIZE / 2))

/* Default: 256 KiB. */
#define H5AC__DEFAULT_DIRTY_BYTES_THRESHOLD     (256 * 1024)

/* Upper bound: one quarter of H5C__MAX_MAX_CACHE_SIZE. */
#define H5AC__MAX_DIRTY_BYTES_THRESHOLD         \
    ((int32_t)(H5C__MAX_MAX_CACHE_SIZE / 4))
00060 
00061 /****************************************************************************
00062  *
00063  * structure H5AC_aux_t
00064  *
00065  * While H5AC has become a wrapper for the cache implemented in H5C.c, there
00066  * are some features of the metadata cache that are specific to it, and which
00067  * therefore do not belong in the more generic H5C cache code.
00068  *
00069  * In particular, there is the matter of synchronizing writes from the
00070  * metadata cache to disk in the PHDF5 case.
00071  *
00072  * Prior to this update, the presumption was that all metadata caches would
00073  * write the same data at the same time since all operations modifying
00074  * metadata must be performed collectively.  Given this assumption, it was
00075  * safe to allow only the writes from process 0 to actually make it to disk,
00076  * while metadata writes from all other processes were discarded.
00077  *
00078  * Unfortunately, this presumption is in error as operations that read
00079  * metadata need not be collective, but can change the location of dirty
00080  * entries in the metadata cache LRU lists.  This can result in the same
00081  * metadata write operation triggering writes from the metadata caches on
00082  * some processes, but not all (causing a hang), or in different sets of
00083  * entries being written from different caches (potentially resulting in
00084  * metadata corruption in the file).
00085  *
00086  * To deal with this issue, I decided to apply a paradigm shift to the way
00087  * metadata is written to disk.
00088  *
00089  * With this set of changes, only the metadata cache on process 0 is able
00090  * to write metadata to disk, although metadata caches on all other
00091  * processes can read metadata from disk as before.
00092  *
00093  * To keep all the other caches from getting plugged up with dirty metadata,
00094  * process 0 periodically broadcasts a list of entries that it has flushed
00095  * since the last notice, and which are currently clean.  The other caches
00096  * mark these entries as clean as well, which allows them to evict the
00097  * entries as needed.
00098  *
00099  * One obvious problem in this approach is synchronizing the broadcasts
00100  * and receptions, as different caches may see different amounts of
00101  * activity.
00102  *
00103  * The current solution is for the caches to track the number of bytes
00104  * of newly generated dirty metadata, and to broadcast and receive
00105  * whenever this value exceeds some user specified threshold.
00106  *
00107  * Maintaining this count is easy for all processes other than process 0 --
00108  * all that is necessary is to add the size of the entry to the total
00109  * whenever there is an insertion, a rename of a previously clean entry,
00110  * or whenever a previously clean entry is marked dirty in an unprotect.
00111  *
00112  * On process 0, we have to be careful not to count dirty bytes twice.
00113  * If an entry is marked dirty, flushed, and marked dirty again, all
00114  * within a single reporting period, only the first marking should
00115  * be added to the dirty bytes generated tally, as that is all that
00116  * the other processes will see.
00117  *
00118  * At present, this structure exists to maintain the fields needed to
00119  * implement the above scheme, and thus is only used in the parallel
00120  * case.  However, other uses may arise in the future.
00121  *
00122  * Instances of this structure are associated with metadata caches via
00123  * the aux_ptr field of H5C_t (see H5Cpkg.h).  The H5AC code is
00124  * responsible for allocating, maintaining, and discarding instances
00125  * of H5AC_aux_t.
00126  *
00127  * The remainder of this header comment documents the individual fields
00128  * of the structure.
00129  *
00130  *                                              JRM - 6/27/05
00131  *
00132  * magic:       Unsigned 32 bit integer always set to
00133  *              H5AC__H5AC_AUX_T_MAGIC.  This field is used to validate
00134  *              pointers to instances of H5AC_aux_t.
00135  *
00136  * mpi_comm:    MPI communicator associated with the file for which the
00137  *              cache has been created.
00138  *
00139  * mpi_rank:    MPI rank of this process within mpi_comm.
00140  *
00141  * mpi_size:    Number of processes in mpi_comm.
00142  *
00143  * write_permitted:  Boolean flag used to control whether the cache
00144  *              is permitted to write to file.
00145  *
00146  * dirty_bytes_threshold: Integer field containing the dirty bytes
00147  *              generation threshold.  Whenever dirty byte creation
00148  *              exceeds this value, the metadata cache on process 0
00149  *              broadcasts a list of the entries it has flushed since
00150  *              the last broadcast (or since the beginning of execution)
00151  *              and which are currently clean (if they are still in the
00152  *              cache).
00153  *
00154  *              Similarly, metadata caches on processes other than process
00155  *              0 will attempt to receive a list of clean entries whenever
00156  *              the threshold is exceeded.
00157  *
00158  * dirty_bytes:  Integer field containing the number of bytes of dirty
00159  *              metadata generated since the beginning of the computation,
00160  *              or (more typically) since the last clean entries list
00161  *              broadcast.  This field is reset to zero after each such
00162  *              broadcast.
00163  *
00164  * dirty_bytes_propagations: This field only exists when the
00165  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
00166  *
00167  *              It is used to track the number of times the cleaned list
00168  *              has been propagated from process 0 to the other
00169  *              processes.
00170  *
00171  * unprotect_dirty_bytes:  This field only exists when the
00172  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
00173  *
00174  *              It is used to track the number of dirty bytes created
00175  *              via unprotect operations since the last time the cleaned
00176  *              list was propagated.
00177  *
00178  * unprotect_dirty_bytes_updates: This field only exists when the
00179  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
00180  *
00181  *              It is used to track the number of times dirty bytes have
00182  *              been created via unprotect operations since the last time
00183  *              the cleaned list was propagated.
00184  *
00185  * insert_dirty_bytes:  This field only exists when the
00186  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
00187  *
00188  *              It is used to track the number of dirty bytes created
00189  *              via insert operations since the last time the cleaned
00190  *              list was propagated.
00191  *
00192  * insert_dirty_bytes_updates:  This field only exists when the
00193  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
00194  *
00195  *              It is used to track the number of times dirty bytes have
00196  *              been created via insert operations since the last time
00197  *              the cleaned list was propagated.
00198  *
00199  * rename_dirty_bytes:  This field only exists when the
00200  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
00201  *
00202  *              It is used to track the number of dirty bytes created
00203  *              via rename operations since the last time the cleaned
00204  *              list was propagated.
00205  *
00206  * rename_dirty_bytes_updates:  This field only exists when the
00207  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
00208  *
00209  *              It is used to track the number of times dirty bytes have
00210  *              been created via rename operations since the last time
00211  *              the cleaned list was propagated.
00212  *
00213  * d_slist_ptr:  Pointer to an instance of H5SL_t used to maintain a list
00214  *              of entries that have been dirtied since the last time they
00215  *              were listed in a clean entries broadcast.  This list is
00216  *              only maintained by the metadata cache on process 0 -- it
00217  *              is used to maintain a view of the dirty entries as seen
00218  *              by the other caches, so as to keep the dirty bytes count
00219  *              in synchronization with them.
00220  *
00221  *              Thus on process 0, the dirty_bytes count is incremented
00222  *              only if either
00223  *
00224  *              1) an entry is inserted in the metadata cache, or
00225  *
00226  *              2) a previously clean entry is renamed, and it does not
00227  *                 already appear in the dirty entry list, or
00228  *
00229  *              3) a previously clean entry is unprotected with the
00230  *                 dirtied flag set and the entry does not already appear
00231  *                 in the dirty entry list.
00232  *
00233  *              Entries are added to the dirty entry list whenever they cause
00234  *              the dirty bytes count to be increased.  They are removed
00235  *              when they appear in a clean entries broadcast.  Note that
00236  *              renames must be reflected in the dirty entry list.
00237  *
00238  *              To reiterate, this field is only used on process 0 -- it
00239  *              should be NULL on all other processes.
00240  *
00241  * d_slist_len: Integer field containing the number of entries in the
00242  *              dirty entry list.  This field should always contain the
00243  *              value 0 on all processes other than process 0.  It exists
00244  *              primarily for sanity checking.
00245  *
00246  * c_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list
00247  *              of entries that were dirty, have been flushed
00248  *              to disk since the last clean entries broadcast, and are
00249  *              still clean.  Since only process 0 can write to disk, this
00250  *              list only exists on process 0.
00251  *
00252  *              In essence, this slist is used to assemble the contents of
00253  *              the next clean entries broadcast.  The list is emptied after
00254  *              each broadcast.
00255  *
00256  * c_slist_len: Integer field containing the number of entries in the clean
00257  *              entries list (*c_slist_ptr).  This field should always
00258  *              contain the value 0 on all processes other than process 0.
00259  *              It exists primarily for sanity checking.
00260  *
00261  * write_done:  In the parallel test bed, it is necessary to ensure that
00262  *              all writes to the server process from cache 0 complete
00263  *              before it enters the barrier call with the other caches.
00264  *
00265  *              The write_done callback allows t_cache to do this without
00266  *              requiring an ACK on each write.  Since these ACKs greatly
00267  *              increase the run time on some platforms, this is a
00268  *              significant optimization.
00269  *
00270  *              This field must be set to NULL when the callback is not
00271  *              needed.
00272  *
00273  ****************************************************************************/
00274 
00275 #ifdef H5_HAVE_PARALLEL
00276 
/* Signature value stored in the magic field of H5AC_aux_t and used to
 * validate pointers to instances of that structure.  The whole expansion
 * is parenthesized so the cast cannot be separated from its operand by the
 * surrounding expression (e.g. when used as the operand of sizeof).
 */
#define H5AC__H5AC_AUX_T_MAGIC        ((unsigned)0x00D0A01)
00278 
00279 typedef struct H5AC_aux_t
00280 {
00281     uint32_t    magic;
00282 
00283     MPI_Comm    mpi_comm;
00284 
00285     int         mpi_rank;
00286 
00287     int         mpi_size;
00288 
00289     hbool_t     write_permitted;
00290 
00291     int32_t     dirty_bytes_threshold;
00292 
00293     int32_t     dirty_bytes;
00294 
00295 #if H5AC_DEBUG_DIRTY_BYTES_CREATION
00296 
00297     int32_t     dirty_bytes_propagations;
00298 
00299     int32_t     unprotect_dirty_bytes;
00300     int32_t     unprotect_dirty_bytes_updates;
00301 
00302     int32_t     insert_dirty_bytes;
00303     int32_t     insert_dirty_bytes_updates;
00304 
00305     int32_t     rename_dirty_bytes;
00306     int32_t     rename_dirty_bytes_updates;
00307 
00308 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
00309 
00310     H5SL_t *    d_slist_ptr;
00311 
00312     int32_t     d_slist_len;
00313 
00314     H5SL_t *    c_slist_ptr;
00315 
00316     int32_t     c_slist_len;
00317 
00318     void        (* write_done)(void);
00319 
00320 } H5AC_aux_t; /* struct H5AC_aux_t */
00321 
00322 #endif /* H5_HAVE_PARALLEL */
00323 
00324 #endif /* _H5ACpkg_H */
00325