Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions lib/dictBuilder/cover.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@
/*-*************************************
* Constants
***************************************/
/**
 * 32-bit indexes are used to reference samples, so the total samples size is limited to 4GB
* on 64bit builds.
* For 32bit builds we choose 1 GB.
* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
* contiguous buffer, so 1GB is already a high limit.
*/
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
#define COVER_DEFAULT_SPLITPOINT 1.0

Expand Down
7 changes: 7 additions & 0 deletions lib/dictBuilder/fastcover.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@
/*-*************************************
* Constants
***************************************/
/**
 * 32-bit indexes are used to reference samples, so the total samples size is limited to 4GB
* on 64bit builds.
* For 32bit builds we choose 1 GB.
* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
* contiguous buffer, so 1GB is already a high limit.
*/
#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
#define FASTCOVER_MAX_F 31
#define FASTCOVER_MAX_ACCEL 10
Expand Down
191 changes: 134 additions & 57 deletions programs/dibio.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));

#define NOISELENGTH 32
#define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */


/*-*************************************
Expand Down Expand Up @@ -88,6 +89,18 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
#undef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))

/**
Returns the size of a file.
If error returns 0. Zero filesize or error is same for us.
Emit warning when the file is inaccessible or zero size.
*/
static size_t DiB_getFileSize (const char * fileName)
{
size_t const fileSize = UTIL_getFileSize(fileName);
return (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
}



/* ********************************************************
* File related operations
Expand All @@ -102,46 +115,74 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
* sampleSizes is filled with the size of each sample.
*/
static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
size_t* sampleSizes, unsigned sstSize,
const char** fileNamesTable, unsigned nbFiles, size_t targetChunkSize,
unsigned displayLevel)
size_t* sampleSizes, int sstSize,
const char** fileNamesTable, int nbFiles,
size_t targetChunkSize, int displayLevel )
{
char* const buff = (char*)buffer;
size_t pos = 0;
unsigned nbLoadedChunks = 0, fileIndex;

for (fileIndex=0; fileIndex<nbFiles; fileIndex++) {
const char* const fileName = fileNamesTable[fileIndex];
unsigned long long const fs64 = UTIL_getFileSize(fileName);
unsigned long long remainingToLoad = (fs64 == UTIL_FILESIZE_UNKNOWN) ? 0 : fs64;
U32 const nbChunks = targetChunkSize ? (U32)((fs64 + (targetChunkSize-1)) / targetChunkSize) : 1;
U64 const chunkSize = targetChunkSize ? MIN(targetChunkSize, fs64) : fs64;
size_t const maxChunkSize = (size_t)MIN(chunkSize, SAMPLESIZE_MAX);
U32 cnb;
FILE* const f = fopen(fileName, "rb");
if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno));
DISPLAYUPDATE(2, "Loading %s... \r", fileName);
for (cnb=0; cnb<nbChunks; cnb++) {
size_t const toLoad = (size_t)MIN(maxChunkSize, remainingToLoad);
if (toLoad > *bufferSizePtr-pos) break;
{ size_t const readSize = fread(buff+pos, 1, toLoad, f);
if (readSize != toLoad) EXM_THROW(11, "Pb reading %s", fileName);
pos += readSize;
sampleSizes[nbLoadedChunks++] = toLoad;
remainingToLoad -= targetChunkSize;
if (nbLoadedChunks == sstSize) { /* no more space left in sampleSizes table */
fileIndex = nbFiles; /* stop there */
break;
char * buff = (char*)buffer;
size_t totalDataLoaded = 0;
int nbSamplesLoaded = 0;
int fileIndex = 0;
FILE * f = NULL;

assert(targetChunkSize <= SAMPLESIZE_MAX);

while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
size_t const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
if (fileSize == 0)
continue;

f = fopen( fileNamesTable[fileIndex], "rb");
if (f == NULL)
EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);

/* Load the first chunk of data from the file */
size_t fileDataLoaded = 0;
{
size_t const headSize = targetChunkSize > 0 ?
MIN(fileSize, targetChunkSize) :
MIN(fileSize, SAMPLESIZE_MAX );
if (totalDataLoaded + headSize > *bufferSizePtr)
break;

fileDataLoaded = fread( buff+totalDataLoaded, 1, headSize, f );
if (fileDataLoaded != headSize)
EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
}
sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
totalDataLoaded += fileDataLoaded;

/* If file-chunking is enabled, load the rest of the file as more samples */
if (targetChunkSize > 0) {
while( fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
size_t chunkDataLoaded = 0;
{
size_t const chunkSize = MIN(fileSize-fileDataLoaded, targetChunkSize);
if (chunkSize == 0) /* no more to read */
break;
if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
break;

chunkDataLoaded = fread( buff+totalDataLoaded, 1, chunkSize, f );
if (chunkDataLoaded != chunkSize)
EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
}
if (toLoad < targetChunkSize) {
fseek(f, (long)(targetChunkSize - toLoad), SEEK_CUR);
} } }
fclose(f);
sampleSizes[nbSamplesLoaded++] = chunkDataLoaded;
totalDataLoaded += chunkDataLoaded;
fileDataLoaded += chunkDataLoaded;
}
}
fileIndex += 1;
fclose(f); f = NULL;
}
if (f != NULL)
fclose(f);

DISPLAYLEVEL(2, "\r%79s\r", "");
*bufferSizePtr = pos;
DISPLAYLEVEL(4, "loaded : %u KB \n", (unsigned)(pos >> 10))
return nbLoadedChunks;
DISPLAYLEVEL(4, "loaded : %u KB \n", (unsigned)(totalDataLoaded >> 10))
*bufferSizePtr = totalDataLoaded;
return nbSamplesLoaded;
}

#define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
Expand Down Expand Up @@ -223,7 +264,6 @@ static void DiB_saveDict(const char* dictFileName,
if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
}


typedef struct {
U64 totalSizeToLoad;
unsigned oneSampleTooLarge;
Expand All @@ -235,22 +275,46 @@ typedef struct {
* provides the amount of data to be loaded and the resulting nb of samples.
* This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
*/
static fileStats DiB_fileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, unsigned displayLevel)
static fileStats DiB_fileStats(const char** fileNamesTable, unsigned nbFiles, size_t chunkSize, int displayLevel)
{
fileStats fs;
unsigned n;
memset(&fs, 0, sizeof(fs));

// We assume that if chunking is requested, the chunk size is <= SAMPLESIZE_MAX
assert( chunkSize <= SAMPLESIZE_MAX );

for (n=0; n<nbFiles; n++) {
U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]);
U64 const srcSize = (fileSize == UTIL_FILESIZE_UNKNOWN) ? 0 : fileSize;
U32 const nbSamples = (U32)(chunkSize ? (srcSize + (chunkSize-1)) / chunkSize : 1);
U64 const chunkToLoad = chunkSize ? MIN(chunkSize, srcSize) : srcSize;
size_t const cappedChunkSize = (size_t)MIN(chunkToLoad, SAMPLESIZE_MAX);
fs.totalSizeToLoad += cappedChunkSize * nbSamples;
fs.oneSampleTooLarge |= (chunkSize > 2*SAMPLESIZE_MAX);
fs.nbSamples += nbSamples;
U64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
// TODO: is there a minimum sample size? What if the file is 1-byte?
if (fileSize == 0) {
DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
continue;
}

/* the case where we are breaking up files in sample chunks */
if (chunkSize > 0)
{
// TODO: is there a minimum sample size? Can we have a 1-byte sample?
fs.nbSamples += ((fileSize + chunkSize-1) / chunkSize) * chunkSize;
fs.totalSizeToLoad += fileSize;
}
else {
/* the case where one file is one sample */
if (fileSize > SAMPLESIZE_MAX) {
/* flag excessively large sample files */
fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);

/* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %ukB",
fileNamesTable[n], SAMPLESIZE_MAX >> 10);
}
fs.nbSamples += 1;
fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
}
}
DISPLAYLEVEL(4, "Preparing to load : %u KB \n", (unsigned)(fs.totalSizeToLoad >> 10));
DISPLAYLEVEL(4, "Number of samples %u\n", fs.nbSamples );
return fs;
}

Expand All @@ -260,18 +324,31 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
ZDICT_fastCover_params_t* fastCoverParams, int optimize)
{
unsigned const displayLevel = params ? params->zParams.notificationLevel :
coverParams ? coverParams->zParams.notificationLevel :
fastCoverParams ? fastCoverParams->zParams.notificationLevel :
0; /* should never happen */
fileStats fs;
void* const dictBuffer = malloc(maxDictSize);
fileStats const fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

int const displayLevel = params ? params->zParams.notificationLevel :
coverParams ? coverParams->zParams.notificationLevel :
fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

/* Shuffle input files before we start assessing how much sample data to load.
The purpose of the shuffle is to pick random samples when the sample
set is larger than what we can load in memory. */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: That isn't the only purpose. We also shuffle to improve training, because there are some biases that can be introduced when samples show up in a "sorted" order, that are mitigated by shuffling.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is interesting, I'd like to know more about these biases. I'm not sure what to add to the comment as this point.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe: "The purpose of the shuffle is to avoid bias in the trainer due to sorted files, and to pick random samples ..."

I'd be happy to explain more how the dictionary builder works, and how sorting can introduce bias, but is probably too verbose to put into a comment. It would be worth a meeting + writing up a doc, so others can read it too.

DISPLAYLEVEL(3, "Shuffling input files\n");
DiB_shuffle(fileNamesTable, nbFiles);

/* Figure out how much sample data to load with how many samples */
fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

size_t* const sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
size_t const memMult = params ? MEMMULT :
coverParams ? COVER_MEMMULT:
FASTCOVER_MEMMULT;
size_t const maxMem = DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, fs.totalSizeToLoad);
/* Limit the size of the training data to the free memory */
/* Limit the size of the training data to 2GB */
/* TODO: there is an opportunity to stop DiB_fileStats() early when the data limit is reached */
size_t loadedSize = MIN( MIN(maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
void* const srcBuffer = malloc(loadedSize+NOISELENGTH);
int result = 0;

Expand All @@ -296,13 +373,13 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,

/* init */
if (loadedSize < fs.totalSizeToLoad)
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(loadedSize >> 20));
DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
(unsigned)(fs.totalSizeToLoad >> 20),
(unsigned)(loadedSize >> 20));

/* Load input buffer */
DISPLAYLEVEL(3, "Shuffling input files\n");
DiB_shuffle(fileNamesTable, nbFiles);

DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable, nbFiles, chunkSize, displayLevel);
DiB_loadFiles(srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
nbFiles, chunkSize, displayLevel);

{ size_t dictSize;
if (params) {
Expand Down