How to Create a Compressible (and other types of) Data Set

I needed a data set containing fairly random, but not totally random, data so that I could test a few things (such as compression). Since I didn’t have a suitable data set, I created this tool to generate one.

################################################
## Generate-DataSet.ps1 - v1.0 September 2014 ##
################################################
               
# Returns the help/usage text for Generate-DataSet as a single multi-line string.
# The text is a single-quoted (literal) string on purpose: it mentions $FileCount
# and $min/maxWordCount by name, and a double-quoted string would expand those as
# variables instead of printing them.
Function Generate-DataSetHelp {'
<<<<<<< Generate-DataSet.ps1 >>>>>>>

>>>>> Synopsis <<<<<

Generate-DataSet.ps1 is designed to create various datasets that can be used for testing purposes; such as testing deduplication, compression, and other things.

To run from this file, invoke the functions, and then run the function::

. .\Generate-DataSet.ps1
Generate-DataSet

>>>> The Parameters <<<<<

-Folder              # The folder in which to create the dataset
-Word                # Word to write (overrides Dictionary)
-Dictionary          # Dictionary filepath/file used for words - see http://www.winedt.org/Dict/
-FolderSizeMB        # Folder Size in MB (overrides $FileCount)
-FileCount           # How many files to create
-FileSizeKB          # FileSize in KB (overrides $min/maxWordCount)
-FileSizeQuickCalc   # Speeds up filesize calculation for large files
-MinWordCount        # Minimum number of words in a file
-MaxWordCount        # Maximum number of words in a file
-MaxSpaceCount       # Maximum number of spaces between words
-SentenceSize        # Size of sentences (carriage returns after nth word)
-LessOutput          # Less Screen Output (no dot after every word)
-Help                # Displays the Help Output

>>>>> Examples <<<<<

Generate-DataSet -FolderSizeMB 256
Generate-DataSet -Folder datagen -Dictionary us.dic -FileSizeKB 1024 -FileSizeQuickCalc -lessOutput

'}
               
#####################################
## MAIN FUNCTION: Generate-DataSet ##
#####################################

Function Generate-DataSet {

    # Creates text files named NNNNNNNNNN.datagen (10-digit, zero-padded number)
    # inside -Folder. Each file is made of "sentences" (lines) built from either a
    # fixed -Word or random lines of the -Dictionary file, separated by a random
    # number of spaces.
    #
    # Stop conditions:
    #   * per file  : a random word count between min/maxWordCount, or - when
    #                 -FileSizeKB is given - as soon as the (measured or estimated)
    #                 size exceeds that many KB;
    #   * per run   : -FileCount files, or - when -FolderSizeMB is given - as soon
    #                 as the folder size exceeds that many MB.
    #
    # Relies on the helper functions Create-Folder, Generate-DataSetHelp and the
    # Wr-*/Wn-* output wrappers defined in the same script.

    ##################
    # Initialization #
    ##################

    Param(

        # The folder in which to create the dataset
        [string]$Folder,

        # Word to write (overrides Dictionary)
        [string]$Word,

        # Dictionary filepath/file used for words - see http://www.winedt.org/Dict/
        [string]$Dictionary,

        # Folder Size in MB (overrides $FileCount)
        [int]$FolderSizeMB,

        # How many files to create
        [int]$FileCount = 1000,

        # FileSize in KB (overrides $min/maxWordCount)
        [int]$FileSizeKB,

        # Instead of writing newfile and getting size after every sentence (which can be very slow) -
        # - this roughly predicts the size the file will be
        [switch]$FileSizeQuickCalc,

        # Minimum number of words in a file
        [int]$minWordCount = 100,

        # Maximum number of words in a file
        [int]$maxWordCount = 1000,

        # Maximum number of spaces between words
        [int]$maxSpaceCount = 10,

        # Size of sentences (carriage returns after nth word)
        [int]$sentenceSize = 10,

        # Less Screen Output (no dot after every word)
        [switch]$lessOutput,

        # Displays the Help Output
        [switch]$help

    )

    # Called with no parameters at all, or with -Help: just return the help text
    If (!($PSBoundParameters.Count)) { return Generate-DataSetHelp }
    If ($help) { return Generate-DataSetHelp }

    ###################
    # Validity Checks #
    ###################

    If (!$Folder) { $Folder = "DATAGEN" }
    $testFolder = Create-Folder $Folder
    If (!$testFolder) { Wr-R "Folder parameter $Folder invalid"; return }
    Wr-G "Folder $testFolder passed!"

    If (!$Word -and !$Dictionary) { $Word = "Consider supplying -word or -dictionary parameters!" }
    If (!$Word) {
        # @() keeps this an array even for a one-line dictionary; otherwise the
        # result is a plain string and indexing it later would yield single characters.
        $dictionaryWords = @(Get-Content $Dictionary -ErrorAction SilentlyContinue)
        If ($dictionaryWords.Count -eq 0) { Wr-R "Unable to read $Dictionary!"; return }
        Wr-G "Dictionary $Dictionary passed!"
        $dictionaryItems = $dictionaryWords.Count
    }

    # 0 means "not supplied" for the two optional size limits, so only a
    # supplied (non-zero) value is range-checked.
    If ($FolderSizeMB -and ($FolderSizeMB -lt 1)) { Wr-R "Invalid Parameter!"; return }
    If ($FileSizeKB   -and ($FileSizeKB   -lt 1)) { Wr-R "Invalid Parameter!"; return }
    If ($FileCount -lt 1)                         { Wr-R "Invalid Parameter!"; return }
    If ($minWordCount -lt 1)                      { Wr-R "Invalid Parameter!"; return }
    If ($maxWordCount -lt $minWordCount)          { Wr-R "Invalid Parameter!"; return }
    If ($maxSpaceCount -lt 1)                     { Wr-R "Invalid Parameter!"; return }
    If ($sentenceSize -lt 1)                      { Wr-R "Invalid Parameter!"; return }
    Wr-G "Parameters passed!"

    # Because Get-Random X is from 0 to X-1)
    $maxWordCount++

    ######################################################
    # Get latest existing XXXXXXXXXX.datagen file number #
    ######################################################

    # Only purely numeric <digits>.datagen names are considered, and the result is
    # forced into an array so that a folder holding exactly one such file works too.
    $existingNumbers = @(
        Get-ChildItem $Folder |
            Where-Object   { $_.Name -match '^\d+\.datagen$' } |
            ForEach-Object { [int]($_.Name.Split(".")[0]) }
    )
    If ($existingNumbers.Count -eq 0) {
        $fileNumber = 1
    }
    else {
        $fileNumber = [int](($existingNumbers | Measure-Object -Maximum).Maximum) + 1
    }

    ######################
    # Creating the files #
    ######################

    $i = 0 # File count
    $tempFilePath = Join-Path $Folder "datagen_test.temp"

    while ($i -lt $FileCount) {

        $fileNumberString = $fileNumber.ToString().PadLeft(10, "0")
        $filePath = Join-Path $Folder ($fileNumberString + ".datagen")
        Wn-C "Creating file $filePath"

        $newLine     = $null # New Line is a string of words
        $newFile     = @()   # New File is an array of lines
        $wordsInFile = Get-Random -Minimum $minWordCount -Maximum $maxWordCount

        $j = 0 # Word Count (per file)
        $k = 0 # Word Count (per sentence)

        while ($j -lt $wordsInFile) {

            Wn-W "." # Progress dot

            # Adds word/dictionaryWord to line
            If ($Word) {
                $newLine += $Word
            }
            else {
                $wordNumber = Get-Random -Maximum $dictionaryItems
                $newLine   += $dictionaryWords[$wordNumber]
            }

            $j++
            $k++

            # Handles end of sentence or end of words
            If ( ($k -eq $sentenceSize) -or ($j -eq $wordsInFile) ) {
                $newFile += $newLine
                $newLine  = $null
                $k = 0
            }
            else {
                # Between 1 and $maxSpaceCount spaces after the word
                $spaces   = (Get-Random -Maximum $maxSpaceCount) + 1
                $newLine += " " * $spaces
            }

            # Handles File Size if checking (only checks after have written a sentence to the file)
            If ($FileSizeKB) { $j = 0 } # Mode is check File Size, this overrides word counting!
            If ( $FileSizeKB -and ($k -eq 0) ) {
                If ($FileSizeQuickCalc) {
                    # Rough estimate: characters held so far, two bytes each
                    $str = [String]$newFile
                    $fileSizeInKB = ($str.Length - $newFile.Count) * 2 / 1024
                }
                else {
                    # Exact but slow: write what we have so far and measure it
                    $newFile | Out-File $tempFilePath
                    $fileSizeInKB = ((Get-Item $tempFilePath).Length) / 1024
                }
                If ($fileSizeInKB -gt $FileSizeKB) {
                    $j = $wordsInFile # Ends the word loop
                    Wr-E; Wr-Y "File $filePath is $fileSizeInKB KB!"
                }
            }

        } # REPEAT for more Words

        # Create the file

        $newFile | Out-File $filePath
        $fileNumber++
        $i++
        If (!$FileSizeKB -or !$lessOutput) { Wr-E }

        # The scratch file used for size measurement is not part of the data set:
        # remove it so it is neither left behind nor counted in the folder size below.
        If ($FileSizeKB -and !$FileSizeQuickCalc) {
            Remove-Item $tempFilePath -ErrorAction SilentlyContinue
        }

        If ($FolderSizeMB) {
            $folderSizeInB  = (Get-ChildItem $Folder | Measure-Object -Property Length -Sum).Sum
            $folderSizeInMB = [int]($folderSizeInB / (1024 * 1024))
            If ($folderSizeInMB -gt $FolderSizeMB) { Wr-R "Folder size is $folderSizeInMB MB"; return }
            else                                   { Wr-G "Folder size is $folderSizeInMB MB" }
            $i = 0 # Mode is check for Folder Size, this overrides file counting!
        }

    } # REPEAT for more Files

} # END Generate-DataSet

###################
## SUB Functions ##
###################

Function Create-Folder {

    # Makes sure the directory named by the first positional argument exists.
    #
    #   Argument : $args[0] - path of the folder to use/create
    #   Returns  : the path exactly as supplied when it is (or now is) a directory;
    #              $null when no path was supplied, when the folder could not be
    #              created, or when the path exists but is not a directory
    #              (for example a file of the same name).

    $path = $args[0]

    # If no argument supplied, return NULL
    If (!$path) { return $null }

    # Already an existing directory: nothing to do
    If (Test-Path $path -PathType Container -ErrorAction SilentlyContinue) { return $path }

    # Otherwise try to create it; failures are silent and detected by the re-test below
    [Void](New-Item $path -ItemType Directory -ErrorAction SilentlyContinue)

    If (Test-Path $path -PathType Container -ErrorAction SilentlyContinue) { return $path }

    return $null

} # END Create-Folder

# Console output shortcuts built on Write-Host.
#   Wr-E          : writes an empty line.
#   Wr-<letter>   : writes the first argument in a colour and ends the line.
#   Wn-<letter>   : same, but with -NoNewline so output continues on the same line.
#   Colour letter : C = Cyan, G = Green, R = Red, W = White, Y = Yellow.
# Wn-W is the exception: it stays silent when a variable named $lessOutput that is
# visible from its scope (e.g. the -lessOutput switch of a calling function) is set.

Function Wr-E {
    Write-Host
}

Function Wr-C {
    Write-Host -ForegroundColor Cyan -Object ($args[0])
}

Function Wr-G {
    Write-Host -ForegroundColor Green -Object ($args[0])
}

Function Wr-R {
    Write-Host -ForegroundColor Red -Object ($args[0])
}

Function Wr-W {
    Write-Host -ForegroundColor White -Object ($args[0])
}

Function Wr-Y {
    Write-Host -ForegroundColor Yellow -Object ($args[0])
}

Function Wn-C {
    Write-Host -NoNewline -ForegroundColor Cyan -Object ($args[0])
}

Function Wn-G {
    Write-Host -NoNewline -ForegroundColor Green -Object ($args[0])
}

Function Wn-R {
    Write-Host -NoNewline -ForegroundColor Red -Object ($args[0])
}

Function Wn-W {
    # Progress output is suppressed entirely in "less output" mode
    If ($lessOutput) { return }
    Write-Host -NoNewline -ForegroundColor White -Object ($args[0])
}

Function Wn-Y {
    Write-Host -NoNewline -ForegroundColor Yellow -Object ($args[0])
}

Comments