NetApp Technical Report (TR) PowerShell Collector Script: Version 2.1

UPDATE: This doesn't work any more, not since the URL path to the TRs was changed. The path is now a bit random and I don't think it is possible to easily fix this. It was fun whilst it lasted!

This is a minor update to the NetApp Technical Report (TR) PowerShell Collector Script: Version 2. I’ve been doing a bit of blog housekeeping and have moved the NetApp TR catalogue into dedicated pages, with links to these pages in the sidebar. The script has been updated to get the PDF titles from the new pages, and it now supports multiple source pages rather than one big one. The new pages have the HTML links to the TRs.

Image: Pages Sidebar (standard/non-mobile view)

The Script


##########################################
## NetAppTR_Collector.ps1 (Version 2.1) ##
##########################################

# This program will collect all the TRs from -
# https://www.netapp.com/us/media/tr-XXXX.pdf
# - where XXXX is specified by $RangeStart to $RangeEnd
# Version 2 adds:
# 1) Checks for file existence on the TR-XXXX part (not the whole file name)
# 2) Automatically names the PDFs from http://www.cosonok.com/2017/07/netapp-technical-reports-catalogue.html
# 3) Logs updated files and new files (as well as dead links - the log filename has been changed)
# Version 2.1:
# 1) PDF names now come from:
# http://www.cosonok.com/p/netapp-trs-4500-4999.html
# http://www.cosonok.com/p/netapp-trs-4000-4499.html

# Script parameters:
#   RangeStart / RangeEnd - first and last TR number (the XXXX in tr-XXXX.pdf)
#   DownloadFolder        - folder that receives the PDFs and the log file
#   LogFile               - name of the log file inside DownloadFolder
Param(
  [Int]$RangeStart = 4000,
  [Int]$RangeEnd   = 4700,
  [String]$DownloadFolder = "C:\Downloads\NetAppTRs\",
  [String]$LogFile = "NetAppTR_Collector.log"
)

# Make sure the download folder path ends with a backslash:
If($DownloadFolder -notmatch '.+?\\$'){
  $DownloadFolder = $DownloadFolder + "\"
}

# Full path of the log file, and the catalog pages that hold the TR titles:
[String]$LogFilePath = "$DownloadFolder$LogFile"
[System.Array]$CatalogURLs = @(
  "http://www.cosonok.com/p/netapp-trs-4500-4999.html",
  "http://www.cosonok.com/p/netapp-trs-4000-4499.html"
)

# Generic display function: writes $P to the host in colour $I (default WHITE).
Function Wr{
  Param($P,$I="WHITE")
  Write-Host -Object $P -ForegroundColor $I
}

##########################
## TITLE EXTRACTOR CODE ##
##########################

# Download every catalog page up front. Each response object (its .Content
# holds the raw HTML) is appended to $URLdata, which Get-Title searches.
[System.Array]$URLdata = @()
Foreach($URL in $CatalogURLs){
  # -UseBasicParsing: skip the Internet Explorer based DOM parsing that
  #   Windows PowerShell uses by default (it fails when IE is unavailable or
  #   not initialised); only the raw .Content is needed here.
  # -ErrorAction Stop: turn every failure into a terminating error so the
  #   Catch block runs instead of a stale $WebData being appended.
  Try{$WebData = Invoke-WebRequest -Uri $URL -Method Get -UseBasicParsing -ErrorAction Stop}
  Catch{
    Wr "FAILED: Unable to acquire catalog from $URL!" RED;PAUSE;EXIT
  }
  $URLdata += $WebData
}

## Get-Title function:
## Looks up the title of one TR in the downloaded catalog pages.
##   -Number : TR number to look up. Defaults to the caller's loop variable $j,
##             so the parameterless call "Get-Title" keeps working.
##   -Pages  : page objects to search (each needs a .Content string holding the
##             page HTML). Defaults to $URLdata.
## Returns the link text starting at "TR-<Number>:" (e.g. "TR-4000: Some Title")
## with runs of whitespace collapsed to one space, or "TR-<Number>" when no
## catalog line matches. Exits the script if a matching line is never closed
## by a "<" before the page ends.
Function Get-Title{
  Param($Number = $j, $Pages = $URLdata)
  $Marker = "TR-$($Number):"
  Foreach($WebPage in $Pages){
    [System.Array]$Lines = $WebPage.Content.Split("`n")
    For($k=0; $k -lt $Lines.Count; $k++){
      $Line = $Lines[$k]
      If(-not ($Line.Contains("href") -and $Line.Contains($Marker))){Continue}

      # Start at the marker itself instead of "the text after the first >":
      # the latter gives an empty title whenever another tag precedes the
      # link on the same line (e.g. <li><a href=...>TR-4000: ...).
      [String]$Title = $Line.Substring($Line.IndexOf($Marker, [System.StringComparison]::Ordinal))

      # The link text may wrap over several lines; keep appending the
      # following lines until the closing tag shows up.
      $Next = $k + 1
      While(-not $Title.Contains("<")){
        If($Next -ge $Lines.Count){
          Wr "FAILED: Something went wrong here!" RED;PAUSE;EXIT
        }
        $Title += " " + $Lines[$Next]
        $Next++
      }

      # Cut at the closing tag, then collapse whitespace. This also removes
      # the carriage returns that splitting a CRLF page on "`n" leaves behind
      # (a CR is not allowed in a file name).
      RETURN (($Title.Split("<")[0] -replace '\s+',' ').Trim())
    }
  }
  RETURN "TR-$($Number)"
}

## Create hashtable of titles to number XXXX:
## Key = TR number as a string, value = file name (without ".pdf") for that TR.
[System.Object]$Titles = @{}
For($j=$RangeEnd;$j -ge $RangeStart;$j--){
  # Get-Title reads $j to know which TR to look up.
  [String]$Name = Get-Title

  # Original substitutions, kept as they were so that files downloaded by
  # earlier runs keep their names:
  $Name = $Name.Replace(":"," -").Replace("/",",")

  # The other characters Windows does not allow in a file name
  # (\ * ? " < > |) would make the later rename/download fail, so they are
  # replaced as well.
  $Name = $Name -replace '[\\*?"<>|]','_'

  # Never store a blank name - the PDF would be saved as just ".pdf".
  If($Name.Trim().Length -eq 0){$Name = "TR-$($j)"}

  $Titles."$j" = $Name
}

#################################
## PREPARE DOWNLOAD FOLDER/LOG ##
#################################

# Create the download folder (-Force is passed so an existing folder is
# acceptable), then confirm it is really there:
New-Item -Path $DownloadFolder -ItemType Directory -Force | Out-Null
If(-not (Test-Path $DownloadFolder)){
  Wr "FAILED: Test-Path $DownloadFolder" RED
  PAUSE
  EXIT
}

# Keep a ".bak" copy of a log left by a previous run; otherwise start a new,
# empty log file:
$LogExists = Test-Path $LogFilePath
If($LogExists){
  Copy-Item -Path $LogFilePath -Destination "$LogFilePath.bak"
}else{
  New-Item -Path $LogFilePath -ItemType File -Force | Out-Null
}

######################
## PDF RENAMER CODE ##
######################

# Bring previously downloaded PDFs in line with the current catalog titles:
# every PDF in the download folder whose name starts with "TR-XXXX" is renamed
# to "<title for XXXX>.pdf" when that is not already its name.
Get-ChildItem -LiteralPath $DownloadFolder | Foreach{
  $CIname = $_.Name

  # Only PDF files are candidates. Folders and other files that happen to
  # start with "TR-" are left alone instead of being turned into ".pdf".
  If($_.PSIsContainer -or $_.Extension -ne ".pdf"){Return}

  # Extract the 4-digit TR number with a regex (case-insensitive, like the
  # original StartsWith check). Split("-")[1].Substring(0,4) threw an
  # exception on names such as "TR-1-old.pdf", where fewer than four
  # characters follow the first dash.
  If($CIname -notmatch '^TR-(\d{4})'){Return}
  [String]$TRnumber = $Matches[1]

  # Numbers outside the requested range have no entry in $Titles.
  $NewName = $Titles.$TRnumber
  If(-not $NewName){Return}
  $NewName = $NewName + ".pdf"
  If($CIname -eq $NewName){Return}

  # Rename-Item errors out when the target already exists (e.g. two copies
  # of the same TR under different names); report it and touch neither file.
  If(Test-Path -LiteralPath (Join-Path $DownloadFolder $NewName)){
    Wr "Skipping rename: $($DownloadFolder)$($CIname) - $NewName already exists" YELLOW
    Return
  }

  Wr "Renaming item: $($DownloadFolder)$($CIname) to $NewName"
  # -LiteralPath: names containing [ or ] must not be read as wildcards.
  Rename-Item -LiteralPath $_.FullName -NewName $NewName
}

#######################
## TR COLLECTOR CODE ##
#######################

## Download the TRs:
## For every number from $RangeStart to $RangeEnd: send a HEAD request for the
## PDF, then download it when there is no local copy yet or the server copy
## is newer than the local one. Dead links, new downloads, updates and failed
## transfers are appended to the log file.
Import-Module BitsTransfer
For($i=$RangeStart; $i -le $RangeEnd; $i++){
  $Title = "tr-$i"
  $WebUrl = "https://www.netapp.com/us/media/$Title.pdf"
  $SavePath = ($DownloadFolder + $Titles."$i" + ".pdf")

  ## Get the header:
  ## (-ErrorAction Stop so that every failure reaches the Catch block.)
  $hdr = $NULL
  Try{$hdr = Invoke-WebRequest -Uri $WebUrl -Method Head -UseBasicParsing -ErrorAction Stop}
  Catch{"Dead link: $WebUrl" | Out-File $LogFilePath -Append}

  ## No header means the request failed:
  If(-not $hdr){
    Wr "DEAD LINK : $WebUrl"
    Continue
  }

  ## Only download updated versions:
  $Download = $FALSE
  If(Test-Path -LiteralPath $SavePath){
    $SavedLRT = [DateTime]((Get-Item -LiteralPath $SavePath).LastWriteTime)

    ## Reset on every pass: if the Last-Modified header is missing or cannot
    ## be parsed, the [DateTime] cast fails and $WebFileLRT would otherwise
    ## still hold the timestamp of the previous TR.
    ## @(...)[0] copes with the header value being one string or an array.
    $WebFileLRT = $NULL
    $LastModified = @($hdr.Headers."last-modified")[0]
    If($LastModified){
      Try{$WebFileLRT = [DateTime]$LastModified}Catch{$WebFileLRT = $NULL}
    }

    If($NULL -eq $WebFileLRT){
      ## Cannot tell whether the server copy is newer - keep what we have.
      Wr "NO DATE   : $Title (no usable Last-Modified header, keeping existing file)" YELLOW
    }elseif($WebFileLRT.Ticks -gt $SavedLRT.Ticks){
      Wr "UPDATED   : $Title" GREEN
      "Updated  : $Title" | Out-File $LogFilePath -Append
      $Download = $TRUE
    }else{
      Wr "NO UPD.   : $Title"
    }
  }else{
    Wr "NEW D/LOAD: $Title" GREEN
    "New D/L  : $Title" | Out-File $LogFilePath -Append
    $Download = $TRUE
  }

  ## Download:
  If($Download){
    ## Record a failed transfer too, so the log does not claim a download
    ## that never completed.
    Try{Start-BitsTransfer -Source $WebUrl -Destination $SavePath -ErrorAction Stop}
    Catch{
      Wr "FAILED    : $Title ($($_.Exception.Message))" RED
      "Failed   : $Title" | Out-File $LogFilePath -Append
    }
  }
}

PAUSE


Comments