I needed a data set containing fairly random, but not totally random, data
so that I could test out a few things (such as compression). Since I didn’t
have a suitable data set, I created this tool to generate one.
################################################
## Generate-DataSet.ps1 - v1.0 September 2014 ##
################################################
Function Generate-DataSetHelp
{
    # Returns the usage/help text for Generate-DataSet as a single string.
    #
    # The text is held in a single-quoted here-string on purpose: it mentions
    # $FileCount and $min/maxWordCount literally, and a double-quoted string
    # would expand those as variables (picking up the caller's values, or
    # nothing at all) instead of printing the names.
    # The closing '@ must stay at the very start of its line.
@'

<<<<<<< Generate-DataSet.ps1 >>>>>>>

>>>>> Synopsis <<<<<

Generate-DataSet.ps1 is designed to create various datasets that can be used
for testing purposes; such as testing deduplication, compression, and other
things.

To run from this file, invoke the functions, and then run the function::

. .\Generate-DataSet.ps1
Generate-DataSet

>>>> The Parameters <<<<<

-Folder             # The folder in which to create the dataset
-Word               # Word to write (overrides Dictionary)
-Dictionary         # Dictionary filepath/file used for words - see http://www.winedt.org/Dict/
-FolderSizeMB       # Folder Size in MB (overrides $FileCount)
-FileCount          # How many files to create
-FileSizeKB         # FileSize in KB (overrides $min/maxWordCount)
-FileSizeQuickCalc  # Speeds up filesize calculation for large files
-MinWordCount       # Minimum number of words in a file
-MaxWordCount       # Maximum number of words in a file
-MaxSpaceCount      # Maximum number of spaces between words
-SentenceSize       # Size of sentences (carriage returns after nth word)
-LessOutput         # Less Screen Output (no dot after every word)
-Help               # Displays the Help Output

>>>>> Examples <<<<<

Generate-DataSet -FolderSizeMB 256
Generate-DataSet -Folder datagen -Dictionary us.dic -FileSizeKB 1024 -FileSizeQuickCalc -lessOutput

'@
}
#####################################
## MAIN FUNCTION: Generate-DataSet ##
#####################################
Function Generate-DataSet {
    # Creates text files named NNNNNNNNNN.datagen (10-digit, zero-padded,
    # continuing after the highest number already in the folder). Each file is
    # made of "sentences": lines of $sentenceSize words separated by a random
    # run of 1..$maxSpaceCount spaces. Words are either the fixed -Word or
    # lines picked at random from the -Dictionary file.
    #
    # Stop conditions:
    #   * per file   : a random word count in [$minWordCount, $maxWordCount],
    #                  or, when -FileSizeKB is given, once the file is larger
    #                  than that many KB (checked after each sentence).
    #   * per folder : $FileCount files, or, when -FolderSizeMB is given, once
    #                  the folder is larger than that many MB.
    #
    # Called with no parameters at all, or with -help, it returns the help
    # text from Generate-DataSetHelp and does nothing else.
    # Progress and errors are written to the host via the Wr-*/Wn-* helpers.

    ##################
    # Initialization #
    ##################

    Param(
        # The folder in which to create the dataset
        [string]$Folder,
        # Word to write (overrides Dictionary)
        [string]$Word,
        # Dictionary filepath/file used for words - see http://www.winedt.org/Dict/
        [string]$Dictionary,
        # Folder Size in MB (overrides $FileCount)
        [int]$FolderSizeMB,
        # How many files to create
        [int]$FileCount = 1000,
        # FileSize in KB (overrides $min/maxWordCount)
        [int]$FileSizeKB,
        # Instead of writing newfile and getting size after every sentence
        # (which can be very slow) - this roughly predicts the size the file will be
        [switch]$FileSizeQuickCalc,
        # Minimum number of words in a file
        [int]$minWordCount = 100,
        # Maximum number of words in a file
        [int]$maxWordCount = 1000,
        # Maximum number of spaces between words
        [int]$maxSpaceCount = 10,
        # Size of sentences (carriage returns after nth word)
        [int]$sentenceSize = 10,
        # Less Screen Output (no dot after every word)
        [switch]$lessOutput,
        # Displays the Help Output
        [switch]$help
    )

    # "return" and its value must share a line, otherwise nothing is returned.
    If (!($PSBoundParameters.Count)) { return Generate-DataSetHelp }
    If ($help) { return Generate-DataSetHelp }

    ###################
    # Validity Checks #
    ###################

    If (!$Folder) { $Folder = "DATAGEN" }
    $testFolder = Create-Folder $Folder
    If (!$testFolder) { Wr-R "Folder parameter $Folder invalid"; return }
    else { Wr-G "Folder $testFolder passed!" }

    If (!$Word -and !$Dictionary) {
        $Word = "Consider supplying -word or -dictionary parameters!"
    }

    If (!$Word) {
        # @(...) keeps a one-line dictionary as a one-element array; without
        # it the result is a bare string and indexing it yields characters.
        $dictionaryWords = @(Get-Content $Dictionary -ErrorAction SilentlyContinue)
        If (!$dictionaryWords) { Wr-R "Unable to read $Dictionary!"; return }
        else { Wr-G "Dictionary $Dictionary passed!" }
        $dictionaryItems = $dictionaryWords.Count
    }

    # 0 means "not supplied" for the two size parameters, so only negative
    # values are invalid. (The file-size check previously tested an undefined
    # $FolderSizeKB variable and therefore never fired.)
    If ($FolderSizeMB -lt 0) { Wr-R "Invalid Parameter!"; return }
    If ($FileSizeKB -lt 0) { Wr-R "Invalid Parameter!"; return }
    If ($FileCount -lt 1) { Wr-R "Invalid Parameter!"; return }
    If ($minWordCount -lt 1) { Wr-R "Invalid Parameter!"; return }
    If ($maxWordCount -lt $minWordCount) { Wr-R "Invalid Parameter!"; return }
    If ($maxSpaceCount -lt 1) { Wr-R "Invalid Parameter!"; return }
    If ($sentenceSize -lt 1) { Wr-R "Invalid Parameter!"; return }
    Wr-G "Parameters passed!"

    # Because Get-Random X is from 0 to X-1
    $maxWordCount++

    ######################################################
    # Get latest existing XXXXXXXXXX.datagen file number #
    ######################################################

    # Always an array, so [-1] is the last *name* even when only one file
    # exists (indexing a lone string would return its last character).
    $files = @(
        Get-ChildItem $Folder |
            Where-Object { ($_.Name).Contains(".datagen") } |
            ForEach-Object { $_.Name }
    )
    If ($files.Count -eq 0) { $fileNumber = 1 }
    else {
        $split = $files[-1].Split(".")
        $fileNumber = [int]($split[0].TrimStart("0"))
        $fileNumber++
    }

    ######################
    # Creating the files #
    ######################

    $i = 0 # File count
    # Scratch file used to measure the size when -FileSizeQuickCalc is off
    $tempFilePath = $Folder + "\datagen_test.temp"

    while ($i -lt $FileCount) {

        $fileNumberString = $fileNumber.ToString().PadLeft(10, "0")
        $filePath = $Folder + "\" + $fileNumberString + ".datagen"
        Wn-C "Creating file $filePath"

        $newLine = $null # New Line is a string of words
        $newFile = @()   # New File is an array of lines
        $wordsInFile = Get-Random -Minimum $minWordCount -Maximum $maxWordCount
        $j = 0 # Word Count (per file)
        $k = 0 # Word Count (per sentence)

        while ($j -lt $wordsInFile) {

            Wn-W "." # Progress dot

            # Adds word/dictionaryWord to line
            If ($Word) { $newLine += $Word }
            else {
                $wordNumber = Get-Random -Maximum $dictionaryItems
                $newLine += $dictionaryWords[$wordNumber]
            }
            $j++
            $k++

            # Handles end of sentence or end of words
            If (($k -eq $sentenceSize) -or ($j -eq $wordsInFile)) {
                $newFile += $newLine
                $newLine = $null
                $k = 0
            }
            else {
                # 1..$maxSpaceCount spaces between words
                $spaces = (Get-Random -Maximum $maxSpaceCount) + 1
                $newLine += " " * $spaces
            }

            # Mode is check File Size, this overrides word counting: the word
            # counter is pinned at 0 so only the size check can end the loop.
            If ($FileSizeKB) { $j = 0 }

            # Handles File Size if checking (only checks after have written a
            # sentence to the file)
            If ($FileSizeKB -and ($k -eq 0)) {
                If ($FileSizeQuickCalc) {
                    # Rough estimate: characters in the joined lines, times 2
                    # bytes per character, without touching the disk.
                    $str = [String]$newFile
                    $fileSizeInKB = ($str.Length - $newFile.Count) * 2 / 1024
                }
                else {
                    $newFile | Out-File $tempFilePath
                    $fileSizeInKB = ((Get-Item $tempFilePath).Length) / 1024
                }
                If ($fileSizeInKB -gt $FileSizeKB) {
                    $j = $wordsInFile # ends the word loop
                    Wr-E; Wr-Y "File $filePath is $fileSizeInKB KB!"
                }
            }

        } # REPEAT for more Words

        # Create the file
        $newFile | Out-File $filePath
        $fileNumber++
        $i++
        If (!$FileSizeKB -or !$lessOutput) { Wr-E }

        # Drop the measuring scratch file so it is neither left behind nor
        # counted towards the folder size below.
        If (Test-Path $tempFilePath) {
            Remove-Item $tempFilePath -ErrorAction SilentlyContinue
        }

        If ($FolderSizeMB) {
            $folderSizeInB = (Get-ChildItem $Folder | Measure-Object -Property Length -Sum).Sum
            $folderSizeInMB = [int]($folderSizeInB / (1024 * 1024))
            If ($folderSizeInMB -gt $FolderSizeMB) { Wr-R "Folder size is $folderSizeInMB MB"; return }
            else { Wr-G "Folder size is $folderSizeInMB MB" }
            $i = 0 # Mode is check for Folder Size, this overrides file counting!
        }

    } # REPEAT for more Files

} # END Generate-DataSet
###################
## SUB Functions ##
###################
Function Create-Folder {
    # Makes sure the folder named by the first positional argument exists.
    #
    # Returns the path (as passed in) when it is an existing folder or when it
    # could be created; returns $null when no argument was supplied, when the
    # path exists but is not a folder (e.g. it is a file), or when creation
    # failed. Errors from Test-Path/New-Item are suppressed; the $null return
    # is the failure signal.
    $path = $args[0]

    # If no argument supplied, return NULL
    If (!$path) { return $null }

    # Create the folder only when it is not already there. -PathType Container
    # stops an existing *file* of that name from being accepted as a folder.
    If (!(Test-Path $path -PathType Container -ErrorAction SilentlyContinue)) {
        [Void](New-Item $path -ItemType Directory -ErrorAction SilentlyContinue)
    }

    # Report success only if the folder really exists now
    If (Test-Path $path -PathType Container -ErrorAction SilentlyContinue) {
        return $path
    }
    return $null
} # END Create-Folder
# Host-output helpers that end the line. Each one writes its first positional
# argument to the host in the colour named by its suffix:
# C = Cyan, G = Green, R = Red, W = White, Y = Yellow. Wr-E writes an empty line.
Function Wr-E { Write-Host }
Function Wr-C { Write-Host -Object $args[0] -ForegroundColor Cyan }
Function Wr-G { Write-Host -Object $args[0] -ForegroundColor Green }
Function Wr-R { Write-Host -Object $args[0] -ForegroundColor Red }
Function Wr-W { Write-Host -Object $args[0] -ForegroundColor White }
Function Wr-Y { Write-Host -Object $args[0] -ForegroundColor Yellow }
# Host-output helpers that do NOT end the line (same colour suffixes as the
# Wr-* family: C = Cyan, G = Green, R = Red, W = White, Y = Yellow).
Function Wn-C { Write-Host -Object $args[0] -ForegroundColor Cyan -NoNewline }
Function Wn-G { Write-Host -Object $args[0] -ForegroundColor Green -NoNewline }
Function Wn-R { Write-Host -Object $args[0] -ForegroundColor Red -NoNewline }
Function Wn-W {
    # Stays silent when $lessOutput is set. The variable is not a parameter
    # here; it is resolved from the calling scope.
    If ($lessOutput) { return }
    Write-Host -Object $args[0] -ForegroundColor White -NoNewline
}
Function Wn-Y { Write-Host -Object $args[0] -ForegroundColor Yellow -NoNewline }
Comments
Post a Comment