Monday, February 4, 2013

Exporting PDF document properties to a text file

The following script extracts selected property fields from PDF documents and dumps them to a text file.  Note that if itextsharp.dll does not exist on your system, it can be downloaded from http://sourceforge.net/projects/itextsharp/.  To add other fields to the output, see the documentation at http://www.adobe.com.

# Load the iTextSharp assembly that provides the PDF reader types used below.
# The path must be a quoted string; replace the placeholder with the real location of the DLL.
[System.Reflection.Assembly]::LoadFrom("\\your_server\..\itextsharp.dll")

# Reads the Title and Subject entries from a PDF's document-information dictionary.
#
# Parameters:
#   $path - full path of the PDF file to open.
# Returns:
#   An object with Title and Subject properties. A field that is missing from the
#   document, or any field of a file that could not be read, is an empty string.
function Get_PDF_Properties($path){
    # Initialise everything locally first. PowerShell looks up unset variables in
    # the caller's scope, so without this a failed open could return the caller's
    # $mypdf left over from a previously processed file.
    $raf = $null
    $reader = $null
    $mypdf = "" | Select-Object Title,Subject
    $mypdf.Title = ""
    $mypdf.Subject = ""

    Try{
        $raf = New-Object iTextSharp.text.pdf.RandomAccessFileOrArray($path)
        # No password is supplied for the reader's second constructor argument.
        $reader = New-Object iTextSharp.text.pdf.PdfReader($raf, $null)

        $info = $reader.Info
        if ($null -ne $info) {
            # Test for the key before reading it so that an absent field is not
            # reported as an error.
            if ($info.ContainsKey("Title")) {
                $title = $info.Item("Title")
                if (-not [string]::IsNullOrEmpty($title)) {
                    $mypdf.Title = $title
                }
            }
            if ($info.ContainsKey("Subject")) {
                $subject = $info.Item("Subject")
                if (-not [string]::IsNullOrEmpty($subject)) {
                    $mypdf.Subject = $subject
                }
            }
        }
    }
    Catch [Exception]{
        # Report to the host only. Anything written to the pipeline here would
        # become part of the function's return value.
        Write-Host "Generic Exception"
        Write-Host $_
        Write-Host ($_ | Format-List * -Force | Out-String)
    }
    Finally{
        # Release the file handle so the PDF is not left locked.
        if ($null -ne $reader) {
            $reader.Close()
        } elseif ($null -ne $raf) {
            # The reader was never constructed; close the raw file access object instead.
            $raf.Close()
        }
    }
    return $mypdf
}
# Root folder to scan and the tab-separated report to append to.
# Replace both placeholders with real paths; they must be quoted strings.
$in = "\\your_server\..\"
$out = "\\your_server\..\file_name.txt"

# Header row of the report.
Add-Content -Path $out -Value "ImagePath`tTitle`tSubject" -Encoding UTF8

# Every *.pdf file below $in; containers whose name happens to match are skipped.
$pdfFiles = Get-ChildItem $in -Recurse -Filter "*.pdf" | Where-Object { -not $_.PSIsContainer }

foreach ($objItem in $pdfFiles) {
    # Read the path straight from the property. Stripping "@{FullName=...}" out of
    # the object's string form corrupts any path that contains "}".
    $path = $objItem.FullName

    # Keep only the last emitted object in case the function writes anything
    # else to the pipeline before its return value.
    $mypdf = @(Get_PDF_Properties $path) | Select-Object -Last 1

    # Use the properties directly so titles containing ";", "}" or " Subject="
    # are preserved. Tabs and line breaks are flattened to spaces because they
    # would break the tab-separated layout.
    $title = ([string]$mypdf.Title) -replace "[\t\r\n]+", " "
    $subject = ([string]$mypdf.Subject) -replace "[\t\r\n]+", " "

    $line = "$path`t$title`t$subject"
    Write-Host $line
    Add-Content -Path $out -Value $line -Encoding UTF8
}