Saturday, 28 April 2012

Utility application - Remove duplicate files within directory (C#)


The following is a small (non Ax-related) utility application that will scan a directory and its sub-directories for duplicate files. Any duplicates are removed, and as a final step any directories left without files are also deleted.

I wrote this to clean up my Pictures folder, where copies had been accidentally moved around.

Additional comments in the code. To build, copy the source into a C# Console-Application.


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Collections;

namespace CheckDuplicateFiles
{
    /// <summary>
    /// This program checks for duplicate files within the path specified
    /// (as the first command-line argument). It removes any duplicates, and
    /// then finally deletes any directories that contain no files within them
    /// or their child directories.
    ///
    /// Candidate duplicates are grouped by a key built from the file length
    /// and a hash of the first 1024 bytes of the file data. Because that key
    /// can collide for files that differ later on, a candidate is only
    /// deleted after a byte-for-byte comparison with the file being kept.
    /// Error checking is minimal.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Number of leading bytes of each file that are hashed into its key.
        /// </summary>
        private const int HashPrefixLength = 1024;

        /// <summary>
        /// Buffer size used when comparing two files byte-for-byte.
        /// </summary>
        private const int CompareBufferSize = 64 * 1024;

        /// <summary>
        /// Program entry point. Expects the root directory as the first argument.
        /// </summary>
        /// <param name="args">Command-line arguments; args[0] is the root path.</param>
        static void Main(string[] args)
        {
            if (args == null || args.Length == 0)
            {
                // Avoid an IndexOutOfRangeException when started without arguments.
                Console.Error.WriteLine("Usage: CheckDuplicateFiles <root-path>");
                return;
            }

            new Program().Run(args[0]);
        }

        /// <summary>
        /// Maps a file key (see <see cref="GetKeyForFile"/>) to every file
        /// found with that key, in the order the files were discovered.
        /// </summary>
        readonly Dictionary<string, List<string>> Entries = new Dictionary<string, List<string>>();

        /// <summary>
        /// Scan <paramref name="rootPath"/> recursively, delete confirmed
        /// duplicate files, then delete directories left without any files.
        /// </summary>
        /// <param name="rootPath">Directory to scan.</param>
        public void Run(string rootPath)
        {
            // Start from a clean map: if Run is called twice, stale entries
            // would list the same path more than once and the kept file could
            // end up being treated as its own duplicate.
            this.Entries.Clear();

            // Map all files
            this.CheckDirectory(rootPath);

            // Delete duplicates
            int filesDeleted = 0;
            foreach (KeyValuePair<string, List<string>> entry in this.Entries)
            {
                List<string> files = entry.Value;
                if (files.Count < 2)
                {
                    continue;
                }

                // The first file found for a key is the one that is kept.
                string keeper = files[0];
                bool keeperReported = false;

                foreach (string file in files.Skip(1))
                {
                    // Same length and same prefix hash is only a hint; confirm
                    // the full contents match before destroying anything.
                    if (!FilesAreIdentical(keeper, file))
                    {
                        continue;
                    }

                    if (!keeperReported)
                    {
                        Console.WriteLine(keeper);
                        keeperReported = true;
                    }

                    File.Delete(file);
                    filesDeleted++;
                }
            }

            Console.WriteLine(String.Format("{0} files deleted", filesDeleted));

            // Delete empty directories (empty meaning no files in it or its
            // sub-directories).
            this.DeleteEmptyDirs(rootPath);
        }

        /// <summary>
        /// Delete the specified directory if it is empty (no files in it or
        /// its sub-directories); otherwise apply the same check to each of
        /// its sub-directories.
        /// </summary>
        /// <param name="path">Directory to check.</param>
        public void DeleteEmptyDirs(string path)
        {
            if (!Directory.Exists(path))
            {
                return; // May have already been deleted.
            }

            if (FilesInDirAndSubDirs(path) == 0)
            {
                Console.WriteLine(String.Format("PATH DELETED: {0}", path));
                Directory.Delete(path, true);
                return;
            }

            // GetDirectories already returns paths that include 'path', so the
            // entries are used as-is. Combining them with 'path' again would
            // duplicate the prefix whenever 'path' is relative.
            foreach (string subDir in Directory.GetDirectories(path))
            {
                this.DeleteEmptyDirs(subDir);
            }
        }

        /// <summary>
        /// Count files in specified path and all sub-directories (recursed).
        /// </summary>
        /// <param name="path">Directory to count files in.</param>
        /// <returns>Total number of files found under <paramref name="path"/>.</returns>
        public int FilesInDirAndSubDirs(string path)
        {
            int total = Directory.GetFiles(path).Length;
            foreach (string subDir in Directory.GetDirectories(path))
            {
                // subDir already includes 'path' (see DeleteEmptyDirs).
                total += FilesInDirAndSubDirs(subDir);
            }
            return total;
        }

        /// <summary>
        /// Recursively build map of all files, grouping them by file key.
        /// </summary>
        /// <param name="path">Directory to scan.</param>
        public void CheckDirectory(string path)
        {
            foreach (string file in Directory.GetFiles(path))
            {
                string key = this.GetKeyForFile(file);

                List<string> matches;
                if (!this.Entries.TryGetValue(key, out matches))
                {
                    matches = new List<string>();
                    this.Entries[key] = matches;
                }

                matches.Add(file);
            }

            foreach (string subDirectory in Directory.GetDirectories(path))
            {
                // subDirectory already includes 'path' (see DeleteEmptyDirs).
                this.CheckDirectory(subDirectory);
            }
        }

        /// <summary>
        /// Build hash-key for the specified file (based on length and data hash).
        /// </summary>
        /// <param name="file">Path of the file to key.</param>
        /// <returns>
        /// A string of the form "length hash", where hash covers at most the
        /// first 1024 bytes of the file.
        /// </returns>
        public string GetKeyForFile(string file)
        {
            // Open read-only so that files marked read-only can still be keyed.
            using (FileStream inFile = File.OpenRead(file))
            {
                long fileLength = inFile.Length;

                // Only the hashed prefix is buffered, never the whole file.
                int readLength = (int)Math.Min((long)HashPrefixLength, fileLength);
                byte[] buf = new byte[readLength];
                ReadBlock(inFile, buf);

                int hashCode = this.ByteArrayHashCode(buf);

                return String.Format("{0} {1}", fileLength, hashCode);
            }
        }

        /// <summary>
        /// Generate hash code for specified buffer. NB byte[].GetHashCode
        /// is not suitable because it is not based on the array contents.
        /// </summary>
        /// <param name="data">Bytes to hash.</param>
        /// <returns>Hash of the contents of <paramref name="data"/>.</returns>
        public int ByteArrayHashCode(params byte[] data)
        {
            unchecked
            {
                // 32-bit FNV prime and offset basis; each byte is xor-ed in
                // and the running value multiplied by the prime.
                const int p = 16777619;
                int hash = (int)2166136261;

                for (int i = 0; i < data.Length; i++)
                {
                    hash = (hash ^ data[i]) * p;
                }

                // Final mixing steps over the accumulated value.
                hash += hash << 13;
                hash ^= hash >> 7;
                hash += hash << 3;
                hash ^= hash >> 17;
                hash += hash << 5;
                return hash;
            }
        }

        /// <summary>
        /// Compare two files byte-for-byte.
        /// </summary>
        /// <param name="first">Path of the first file.</param>
        /// <param name="second">Path of the second file.</param>
        /// <returns>True when both files have the same length and contents.</returns>
        private static bool FilesAreIdentical(string first, string second)
        {
            using (FileStream a = File.OpenRead(first))
            using (FileStream b = File.OpenRead(second))
            {
                if (a.Length != b.Length)
                {
                    return false;
                }

                byte[] bufA = new byte[CompareBufferSize];
                byte[] bufB = new byte[CompareBufferSize];

                while (true)
                {
                    int readA = ReadBlock(a, bufA);
                    int readB = ReadBlock(b, bufB);

                    if (readA != readB)
                    {
                        return false;
                    }

                    if (readA == 0)
                    {
                        return true; // Both streams exhausted without a mismatch.
                    }

                    for (int i = 0; i < readA; i++)
                    {
                        if (bufA[i] != bufB[i])
                        {
                            return false;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Fill the buffer from the stream, looping because a single Read call
        /// may return fewer bytes than requested.
        /// </summary>
        /// <param name="stream">Stream to read from.</param>
        /// <param name="buffer">Buffer to fill.</param>
        /// <returns>Number of bytes read; less than the buffer size only at end of stream.</returns>
        private static int ReadBlock(Stream stream, byte[] buffer)
        {
            int total = 0;
            while (total < buffer.Length)
            {
                int read = stream.Read(buffer, total, buffer.Length - total);
                if (read == 0)
                {
                    break;
                }
                total += read;
            }
            return total;
        }

    }
}

6 comments:

  1. Your CheckDirectory function will NOT detect files from the subdirectory if it exists.

    My workaround for the function:

    private Dictionary<string, List<string>> CheckDirectory(string path)
    {
    Dictionary<string, List<string>> entriesFrom = new Dictionary<string, List<string>>();
    Dictionary<string, List<string>> entriesTo = new Dictionary<string, List<string>>();


    foreach (string file in GetFonts(path))
    {
    string key = GetKeyForFile(file);

    List<string> array;

    array = entriesFrom.ContainsKey(key) ? entriesFrom[key] : new List<string>();

    entriesFrom[key] = array;
    array.Add(file);
    }

    foreach (string subDirectory in Directory.GetDirectories(path))
    {
    entriesTo = CheckDirectory(Path.Combine(path, subDirectory));
    }
    entriesFrom.ToList().ForEach(x => entriesTo[x.Key] = x.Value);
    return entriesTo;
    }

    ReplyDelete
    Replies
    1. Hi Rudie - You may have misunderstood the intention of the code. I'm struggling to follow your amendment ('GetFonts' ?) - The CheckDirectory function first scans all files in the supplied path, then recursively calls itself on all sub-directories within that path. Each entry in the associative array is the file key, ie hash of contents/name etc, and an array of files that match that key, for later comparison/cleanup. Your code that assigns the value to the key of the map entry doesn't make any sense. I suggest you have another look and/or debug the code as it runs.

      Delete
  2. This is probably a stupid question, but where are you supposed to supply the path?

    ReplyDelete
  3. This is probably a stupid question, but where are you supposed to supply the path?

    ReplyDelete
  4. This is probably a stupid question, but where are you supposed to supply the path?

    ReplyDelete
    Replies
    1. The base path is supplied as a parameter to 'Run'.

      Delete