From 68d6b160f983aba161e176b2f278a79064c262bc Mon Sep 17 00:00:00 2001
From: Richard Logwood
Date: Fri, 7 Jun 2024 12:18:25 -0400
Subject: [PATCH] first commit

---
 windows/README.md |   6 ++
 windows/frd.ps1   | 272 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+)
 create mode 100755 windows/README.md
 create mode 100755 windows/frd.ps1

diff --git a/windows/README.md b/windows/README.md
new file mode 100755
index 0000000..e636bb0
--- /dev/null
+++ b/windows/README.md
@@ -0,0 +1,6 @@
+
+
+## `frd.ps1` - find and remove duplicates **WIP**
+- Pass the directory with `-CheckDir` (or edit the default in the file) and run it to get a report of files to be removed
+- Removal step is pending further testing
+
diff --git a/windows/frd.ps1 b/windows/frd.ps1
new file mode 100755
index 0000000..bbebfae
--- /dev/null
+++ b/windows/frd.ps1
@@ -0,0 +1,272 @@
+# Find and Remove Duplicates
+#
+# Usage: .\frd.ps1 [-CheckDir DIRECTORY]
+
+param (
+    # This is the directory we'll check for duplicates
+    [Parameter(Position = 0)]
+    [string]$CheckDir = "C:\Program Files\Microsoft Office\Updates\Download\PackageFiles"
+)
+
+$check_dir = $CheckDir
+
+# Methodology
+# A hash value is computed for each file, files with the same hash are considered duplicate
+# Sort the duplicate files found for a hash by their creation date in ascending order
+# Deletion candidate will be the first file in the list
+
+
+# Describes one scanned file together with the hash of its content
+class FoundFile {
+    [string]$Name
+    [string]$Directory
+    # FileInfo.Length is a 64-bit value; [int] fails for files of 2 GB and larger
+    [long]$Size
+    [double]$fileSizeInMB
+    [double]$fileSizeInKB
+    [datetime]$Creation
+    [string]$Hash
+
+    FoundFile([System.IO.FileInfo] $file, [string]$hash) {
+        $this.Name = $file.Name
+        $this.Directory = $file.DirectoryName
+        $this.Size = $file.Length
+        $this.fileSizeInMB = [math]::Round($this.Size / 1MB, 4)
+        $this.fileSizeInKB = [math]::Round($this.Size / 1KB, 2)
+        $this.Creation = $file.CreationTimeUtc
+        $this.Hash = $hash
+    }
+
+    # Method to display information
+    # (a class method cannot write to the pipeline, so Write-Host is used)
+    [void]DisplayInfo() {
+        Write-Host $this.ToString()
+    }
+
+    [string]ToString() {
+        return "$($this.Directory) $($this.Name) $(Format-NumberWithCommas -Number $this.fileSizeInKB -NumDecimals 2) KB $($this.Creation)"
+    }
+}
+
+# Writes the size in bytes, KB, MB and GB, each line prefixed with the description
+function ShowSizes() {
+    param (
+        [Parameter(Mandatory = $true, Position = 0)]
+        [string]$description,
+
+        [Parameter(Mandatory = $true, Position = 1)]
+        [long]$sizeInBytes
+    )
+
+    # Convert the size to kilobytes (KB)
+    $sizeInKB = $sizeInBytes / 1KB
+
+    # Convert the size to megabytes (MB)
+    $sizeInMB = $sizeInBytes / 1MB
+
+    # Convert the size to gigabytes (GB)
+    $sizeInGB = $sizeInBytes / 1GB
+
+    # Display the size in different units
+    # NOTE: "-Number $x, 2" binds the array ($x, 2) to -Number and leaves -NumDecimals at 0,
+    # so the number of decimals has to be passed as its own parameter
+    Write-Output ""
+    Write-Output "$description $(Format-NumberWithCommas -Number $sizeInBytes) bytes."
+    Write-Output "$description $(Format-NumberWithCommas -Number $sizeInKB -NumDecimals 2) KB."
+    Write-Output "$description $(Format-NumberWithCommas -Number $sizeInMB -NumDecimals 2) MB."
+    Write-Output "$description $(Format-NumberWithCommas -Number $sizeInGB -NumDecimals 4) GB."
+}
+
+
+# Formats a number with group separators and the requested number of decimals
+function Format-NumberWithCommas {
+    param (
+        [Parameter(Mandatory = $true, Position = 0)]
+        [double]$Number,
+
+        [Parameter(Position = 1)]
+        [int]$NumDecimals = 0
+    )
+
+    return "{0:N$NumDecimals}" -f $Number
+}
+
+# Function to compute file hash (SHA-256 as a hex string without separators)
+# NOTE: has the same name as the built-in Get-FileHash cmdlet; inside this script the
+# function takes precedence over the cmdlet
+function Get-FileHash ($path) {
+    $stream = [System.IO.File]::OpenRead($path)
+    try {
+        $sha256 = [System.Security.Cryptography.SHA256]::Create()
+        try {
+            $hash = $sha256.ComputeHash($stream)
+        }
+        finally {
+            $sha256.Dispose()
+        }
+    }
+    finally {
+        # release the file handle even when hashing fails
+        $stream.Dispose()
+    }
+    return [BitConverter]::ToString($hash) -replace '-', ''
+}
+
+
+# Hashes every file below $Dirname and returns a hashtable that maps each hash to the
+# array of FoundFile objects sharing it
+function Get-AllFilesByHash {
+    param (
+        [Parameter(Mandatory = $true)]
+        [string]$Dirname
+    )
+
+    # Get all files in the directory
+    # (-LiteralPath: characters such as [ ] in the name must not act as wildcards)
+    $files = Get-ChildItem -LiteralPath $Dirname -File -Recurse
+
+    # Create a hashtable to store file hashes
+    $hashTable = @{}
+
+    # Iterate through each file and compute hash
+    foreach ($file in $files) {
+        try {
+            $hash = Get-FileHash $file.FullName
+        }
+        catch {
+            # a locked or unreadable file must not abort the whole scan
+            Write-Warning "Skipping '$($file.FullName)': $($_.Exception.Message)"
+            continue
+        }
+
+        # initialize array for this hash if needed
+        if (!$hashTable.ContainsKey($hash)) {
+            $hashTable[$hash] = @()
+        }
+        $hashTable[$hash] += [FoundFile]::new($file, $hash)
+    }
+    $hashTable
+}
+
+
+# Returns the entries of $files that hold more than one file, each group sorted by
+# creation date in ascending order
+function Get-Duplicates {
+    param (
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyCollection()]
+        [hashtable] $files
+    )
+    $dups_hash = @{}
+    foreach ($entry in $files.GetEnumerator()) {
+        $dup_files = $entry.Value
+
+        if ($dup_files.Count -gt 1) {
+            $dups_hash[$entry.Key] = @($dup_files | Sort-Object -Property Creation)
+        }
+    }
+    $dups_hash
+}
+
+
+# Writes a report of the duplicate groups in $files (the result of Get-Duplicates),
+# followed by the number of groups and the combined size of all listed files
+function Show-Duplicates {
+    param (
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyCollection()]
+        [hashtable] $files
+    )
+
+    $num_dups = 0
+    [long]$total_dup_size = 0
+
+    foreach ($entry in $files.GetEnumerator()) {
+        $dup_files = $entry.Value
+        $num_dups += 1
+
+        if ($dup_files.Count -lt 2) {
+            throw "duplicates collection contains non duplicate for entry: $($entry.Key)"
+        }
+
+        Write-Output ""
+        Write-Output "Found duplicate $($num_dups): $($entry.Key)"
+        for ($i = 0; $i -lt $dup_files.Count; $i++) {
+            $total_dup_size += $dup_files[$i].Size
+            # prefix each file with its position inside the group
+            Write-Output "$($i): $($dup_files[$i].ToString())"
+        }
+    }
+    Write-Output ""
+    Write-Output "Found $num_dups duplicate file hashes."
+    ShowSizes "Duplicate Files Size" $total_dup_size
+}
+
+# Earlier variant of Show-Duplicates: takes the unfiltered table from Get-AllFilesByHash,
+# skips hashes with a single file and prefixes each file with the index of its hash entry
+function Show-Duplicates-v1 {
+    param (
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyCollection()]
+        [hashtable] $files
+    )
+
+    $hash_num = 0
+    [long]$total_dup_size = 0
+    $num_dups = 0
+    foreach ($entry in $files.GetEnumerator()) {
+        $dup_files = $entry.Value
+
+        if ($dup_files.Count -gt 1) {
+            $num_dups++
+            Write-Output ""
+            Write-Output "Found duplicate $($num_dups): $($entry.Key)"
+            for ($i = 0; $i -lt $dup_files.Count; $i++) {
+                $total_dup_size += $dup_files[$i].Size
+                Write-Output "$($hash_num): $($dup_files[$i].ToString())"
+            }
+        }
+        $hash_num += 1
+    }
+
+    Write-Output ""
+    Write-Output "Found $num_dups duplicate file hashes."
+    ShowSizes "Duplicate Files Size" $total_dup_size
+}
+
+# Placeholder for the removal step: throws until it is implemented
+function TBD_Get-Duplicates {
+    throw "not implemented"
+    # need to sort the dups by date before deleting
+
+    $dupsTable = @{}
+
+    $total_dup_size = 0
+
+    # Find and remove duplicate files
+    foreach ($hash in $hashTable.Keys) {
+        $fileGroup = $hashTable[$hash]
+        if ($fileGroup.Count -gt 1) {
+            # Keep the first file and delete the rest
+            $fileGroup[1..($fileGroup.Count - 1)] | ForEach-Object {
+                Write-Output "Would Delete duplicate file: $_"
+                $file = Get-Item -Path $_
+                $fileSize = $file.Length
+                $fileName = $file.Name
+                $total_dup_size += $fileSize
+                $dupsTable[$fileName] = $fileSize
+                # Remove-Item -Path $_ -Force
+            }
+        }
+    }
+}
+
+
+if (-not (Test-Path -LiteralPath $check_dir -PathType Container)) {
+    Write-Warning "Directory not found: $check_dir"
+}
+else {
+    $files = Get-AllFilesByHash -Dirname $check_dir
+    $dups = Get-Duplicates $files
+    Show-Duplicates $dups
+}