# Find and Remove Duplicates
#
# Scans a directory tree, groups the files by SHA-256 content hash and, on
# request, lists the duplicate groups (-show_dups) and/or deletes every copy
# except the most recently created one (-delete). Each file is copied to
# -backup_dir before it is deleted; a file whose backup fails is never deleted.
param (
    [string]$check_dir,
    [string]$backup_dir,
    [switch]$show_dups,
    [switch]$delete
)

# Prints the usage text and terminates the script with exit code 1.
function ShowUsage {
    $message = @"

Find and Remove Duplicates usage:

    ** Use with caution **

    frd.ps1 -check_dir [-delete -backup_dir (backup location)] [-show_dups]

      -check_dir (where to look for duplicates)
      -backup_dir (location to backup files before deletion)
      -delete # will delete the duplicates
      -show_dups # will show the duplicates

    ** A typical dirctory to check would be the Office downloads directory:
      -check_dir 'C:\Program Files\Microsoft Office\Updates\Download\PackageFiles'

    ** The backup directory can be any writable location, the destination directory is created:
      -backup_dir 'C:\tmp\backups'

    ** Files can only be deleted if you have the needed privledge

"@
    Write-Output $message
    exit 1
}

# Validates the script-level parameters. Any invalid combination prints an
# error followed by the usage text and exits (via ShowUsage). On success it
# echoes the work that is about to be done.
function CheckUsage {
    if (-not $delete -and -not $show_dups) {
        Write-Output "`n*** ERROR: No work requested, bye"
        ShowUsage
    }

    if (-not $check_dir) {
        Write-Output "`n*** ERROR: -check_dir option not specified"
        ShowUsage
    }

    # Fail early with a clear message instead of letting Get-ChildItem emit a
    # non-terminating error and the script report "0 duplicates".
    if (-not (Test-Path -LiteralPath $check_dir -PathType Container)) {
        Write-Output "`n*** ERROR: -check_dir '$check_dir' is not an existing directory"
        ShowUsage
    }

    if ($delete -and -not $backup_dir) {
        Write-Output "`n*** ERROR: When -delete is specified, you need to also specify the -backup_dir option"
        ShowUsage
    }

    Write-Output "> Checking for duplicates in: $check_dir"

    if ($delete) {
        Write-Output "> Looking for duplicates to delete..."
    }

    if ($show_dups) {
        Write-Output "> Showing Duplicates..."
    }

    if ($backup_dir) {
        Write-Output "> Backing up to: $backup_dir"
    }
}

# Snapshot of one scanned file together with its content hash.
class FoundFile {
    [string]$Name
    [string]$FullName
    [string]$Directory
    # FileInfo.Length is a 64-bit value; an [int] here fails for files > 2 GB.
    [Int64]$Size
    [double]$fileSizeInMB
    [double]$fileSizeInKB
    [datetime]$Creation
    [string]$Hash

    FoundFile([System.IO.FileInfo] $file, [string]$hash) {
        $this.Name = $file.Name
        $this.FullName = $file.FullName
        $this.Directory = $file.DirectoryName
        $this.Size = $file.Length
        $this.fileSizeInMB = [math]::Round($this.Size / 1MB, 4)
        $this.fileSizeInKB = [math]::Round($this.Size / 1KB, 2)
        $this.Creation = $file.CreationTimeUtc
        $this.Hash = $hash
    }

    # Writes the one-line description of this file to the host.
    # (Pipeline output produced inside a [void] class method is discarded,
    # so Write-Host is used rather than Write-Output.)
    [void]DisplayInfo() {
        Write-Host $this.ToString()
    }

    # One-line description: directory, name, size in KB and creation time (UTC).
    [string]ToString() {
        return "$($this.Directory) $($this.Name) $(Format-NumberWithCommas $this.fileSizeInKB) KB $($this.Creation)"
    }
}

# Prints a byte count in bytes, KB, MB and GB, each line prefixed by $description.
function ShowSizes() {
    param (
        [Parameter(Mandatory = $true, Position = 0)]
        [string]$description,

        [Parameter(Mandatory = $true, Position = 1)]
        [Int64]$sizeInBytes
    )

    $sizeInKB = $sizeInBytes / 1KB
    $sizeInMB = $sizeInBytes / 1MB
    $sizeInGB = $sizeInBytes / 1GB

    # NOTE: the decimals must be passed as their own argument. Writing
    # "-Number $x, 2" builds the array ($x, 2) and binds it to -Number,
    # which silently drops the requested precision.
    Write-Output ""
    Write-Output "$description $(Format-NumberWithCommas -Number $sizeInBytes) bytes."
    Write-Output "$description $(Format-NumberWithCommas -Number $sizeInKB -NumDecimals 2) KB."
    Write-Output "$description $(Format-NumberWithCommas -Number $sizeInMB -NumDecimals 2) MB."
    Write-Output "$description $(Format-NumberWithCommas -Number $sizeInGB -NumDecimals 4) GB."
}

# Formats a number with thousands separators and a fixed number of decimals,
# using the .NET "N" numeric format of the current culture.
#   -Number       value to format (any numeric type)
#   -NumDecimals  digits after the decimal separator (default 0)
function Format-NumberWithCommas {
    param (
        [Parameter(Mandatory = $true, Position = 0)]
        # Left untyped so integers and doubles are formatted without conversion.
        [System.Object]$Number,

        [Parameter(Position = 1)]
        [int]$NumDecimals = 0
    )

    $format = '{0:N' + $NumDecimals + '}'
    return $format -f $Number
}

# Returns the SHA-256 hash of the file at $path as an upper-case hex string.
# NOTE: this script-local function shadows the built-in Get-FileHash cmdlet
# and, unlike the cmdlet, returns a plain string rather than an object.
function Get-FileHash ($path) {
    $stream = $null
    $sha256 = $null
    try {
        $stream = [System.IO.File]::OpenRead($path)
        $sha256 = [System.Security.Cryptography.SHA256]::Create()
        $hash = $sha256.ComputeHash($stream)
        return [BitConverter]::ToString($hash) -replace '-', ''
    }
    finally {
        # Release the file handle and the hasher even when hashing fails.
        if ($null -ne $sha256) { $sha256.Dispose() }
        if ($null -ne $stream) { $stream.Dispose() }
    }
}

# Recursively hashes every file below $Dirname.
# Returns a hashtable: hash string -> list of FoundFile objects with that hash.
# Files that cannot be read are reported with a warning and skipped.
function Get-AllFilesByHash {
    param (
        [Parameter(Mandatory = $true)]
        [string]$Dirname
    )

    $hashTable = @{}

    # -LiteralPath: directory names may legitimately contain [ ] * ?
    foreach ($file in Get-ChildItem -LiteralPath $Dirname -File -Recurse) {
        try {
            $hash = Get-FileHash $file.FullName
        }
        catch {
            Write-Warning "Skipping unreadable file '$($file.FullName)': $($_.Exception.Message)"
            continue
        }

        if (-not $hashTable.ContainsKey($hash)) {
            # A growable list avoids re-allocating an array on every "+=".
            $hashTable[$hash] = [System.Collections.Generic.List[object]]::new()
        }

        $hashTable[$hash].Add([FoundFile]::new($file, $hash))
    }

    $hashTable
}

# Filters the hashtable produced by Get-AllFilesByHash down to the hashes that
# have more than one file. Each group is returned as an array sorted by
# creation time, newest first.
function Get-Duplicates {
    param (
        [Parameter(Mandatory = $true)]
        [hashtable] $files
    )

    $dups_hash = @{}

    foreach ($entry in $files.GetEnumerator()) {
        $dup_files = $entry.Value
        if ($dup_files.Count -gt 1) {
            $dups_hash[$entry.Key] = @($dup_files | Sort-Object -Property Creation -Descending)
        }
    }

    $dups_hash
}

# Lists every duplicate group and the disk space that removing the redundant
# copies (all but the first file of each group) would free.
function Show-Duplicates {
    param (
        [Parameter(Mandatory = $true)]
        [hashtable] $files
    )

    $num_dups = 0
    [Int64]$total_dup_size = 0

    foreach ($entry in $files.GetEnumerator()) {
        $dup_files = $entry.Value
        $num_dups += 1

        if ($dup_files.Count -lt 2) {
            throw "duplicates collection contains non duplicate for entry: $($entry.Key)"
        }

        Write-Output ""
        Write-Output "Found duplicate $(Format-NumberWithCommas $num_dups): $($entry.Key)"

        for ($i = 0; $i -lt $dup_files.Count; $i++) {
            # only count disk space of removals
            if ($i -gt 0) {
                $total_dup_size += $dup_files[$i].Size
            }
            # 1-based position inside the group; entry 1 is the copy that is kept
            Write-Output "$($i + 1): $($dup_files[$i].ToString())"
        }
    }

    Write-Output ""
    Write-Output "Found $(Format-NumberWithCommas $num_dups) duplicate file hashes."
    ShowSizes "Duplicate Files Size" $total_dup_size
}

# Copies $file below $backupDirectory, re-creating the file's original
# directory path (minus the drive qualifier) underneath it. An existing backup
# of the same file is overwritten. Any failure is raised as a terminating
# error so that callers never proceed to delete a file that was not backed up.
function BackupDuplicateFile {
    param (
        [Parameter(Mandatory = $true)]
        [FoundFile] $file,

        [Parameter(Mandatory = $true)]
        [string] $backupDirectory
    )

    Write-Output "backup directory is: $backupDirectory"

    # Remove the drive letter so the source path can be nested under the backup root
    $pathWithoutDrive = Split-Path -Path $file.Directory -NoQualifier
    $fullBackupDirectory = Join-Path -Path $backupDirectory -ChildPath $pathWithoutDrive

    if (-not (Test-Path -LiteralPath $fullBackupDirectory)) {
        # Creates missing parent directories as well; the DirectoryInfo result
        # is discarded so it does not leak into the script's output.
        New-Item -Path $fullBackupDirectory -ItemType Directory -Force -ErrorAction Stop | Out-Null
    }

    $fullDestinationPath = Join-Path -Path $fullBackupDirectory -ChildPath $file.Name
    Copy-Item -LiteralPath $file.FullName -Destination $fullDestinationPath -Force -ErrorAction Stop

    Write-Output "  + backed up to: $fullBackupDirectory"
}

# Backs up $File to the script-level $backup_dir and then deletes it.
# Throws if either the backup or the deletion fails; the file is only deleted
# after its backup succeeded.
function DeleteDuplicateFile {
    param (
        [Parameter(Mandatory = $true)]
        [FoundFile] $File
    )

    Write-Output ""
    Write-Output "Deleting File: $($File.Name) $(Format-NumberWithCommas $File.fileSizeInKB) KB"

    try {
        BackupDuplicateFile -file $File -backupDirectory $backup_dir
    }
    catch {
        # Abort before Remove-Item: without this guard a failed copy would be
        # followed by the deletion of the only remaining copy of the data.
        throw "backup failed, file NOT deleted: $($_.Exception.Message)"
    }

    # Force delete a read-only or protected file
    Write-Output "  ... deleting: $($File.FullName)"
    Remove-Item -LiteralPath $File.FullName -Force -ErrorAction Stop
    Write-Output "  - removed from: $($File.Directory)"
}

# Deletes all but the newest file of every duplicate group and reports how
# many files were removed and how much space was recovered.
# NOTE: Duplicates are pre-sorted by creation date (newest first).
function ProcessDuplicates {
    param (
        [Parameter(Mandatory = $true)]
        [hashtable] $files
    )

    $num_deleted = 0
    [Int64]$total_deleted_size = 0

    Write-Output ""

    foreach ($entry in $files.GetEnumerator()) {
        $dup_files = $entry.Value

        if ($dup_files.Count -lt 2) {
            throw "duplicates collection contains non duplicate for entry: $($entry.Key)"
        }

        # the first file is the newest, array is sorted in descending order by creation date
        # delete all the duplicates older than the first file, element 0
        for ($i = 1; $i -lt $dup_files.Count; $i++) {
            $file_to_delete = $dup_files[$i]

            try {
                DeleteDuplicateFile -File $file_to_delete
            }
            catch {
                # Only files that were really removed are counted below.
                Write-Warning "Skipped '$($file_to_delete.FullName)': $($_.Exception.Message)"
                continue
            }

            $total_deleted_size += $file_to_delete.Size
            $num_deleted += 1
        }
    }

    Write-Output ""
    Write-Output "Deleted $num_deleted duplicate files"
    ShowSizes "Recovered Size" $total_deleted_size
}

CheckUsage

$files = Get-AllFilesByHash -Dirname $check_dir
$dups = Get-Duplicates $files

if ($show_dups) {
    Show-Duplicates $dups
}

if ($delete) {
    ProcessDuplicates $dups
}