Joe Gilmore

8 min read

Finding Duplicate files with Node JS

Want a quick and easy way to find duplicate files on your computer? This page shows you how to do it with a short Node.js script.

Finding Duplicate files with Node JS

Hunting down Duplicate file names

This simple NodeJS Script allows you to hunt in a specific directory for duplicate file names:

const fs = require("fs").promises;
const path = require("path");

// Root directory to scan: the first command-line argument after the script path
const [directory] = process.argv.slice(2);
// Output mode: true writes the results to a JSON file, false logs them to the console
const writeToJson = true;

/**
 * Recursively walks `dir` and collects files whose base name (file name
 * without its extension) has already been seen earlier in the walk.
 *
 * Entries whose name starts with "." (hidden files and directories) are skipped.
 *
 * @param {string} dir - Directory to scan.
 * @param {string[]} duplicates - Output array; the full path of every repeated
 *   file name is pushed onto it. The first file seen with a name is not listed.
 * @param {Object} filenames - Accumulator of base names seen so far
 *   (`filenames[name] === true`). Pass an empty object for a fresh scan.
 * @returns {Promise<void>} Resolves when `dir` and all its subdirectories have
 *   been visited; rejects if a directory or file cannot be read.
 */
const findDuplicates = async (dir, duplicates, filenames) => {
  const entries = await fs.readdir(dir);

  for (const entry of entries) {
    if (entry.startsWith(".")) continue;

    const filePath = path.join(dir, entry);
    const stats = await fs.stat(filePath);

    if (stats.isDirectory()) {
      await findDuplicates(filePath, duplicates, filenames);
      continue;
    }

    const { name } = path.parse(entry);

    // Check own properties only: a plain `filenames[name]` lookup is truthy for
    // inherited keys such as "constructor" or "toString", which would report a
    // file with that name as a duplicate the first time it is seen.
    if (Object.prototype.hasOwnProperty.call(filenames, name)) {
      duplicates.push(filePath);
    } else {
      // defineProperty (rather than assignment) so that a file called
      // "__proto__" is stored as a normal own key instead of hitting the
      // prototype setter.
      Object.defineProperty(filenames, name, {
        value: true,
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }
  }
};

// Entry point: scans `directory` for duplicate file names, then either writes
// the list to duplicates.json (relative to the current working directory) or
// logs it to the console, depending on `writeToJson`.
(async () => {
  // Without this guard a missing argument surfaces as an opaque TypeError
  // from fs.readdir(undefined).
  if (!directory) {
    console.error("Usage: node find-duplicates.js <directory>");
    process.exitCode = 1;
    return;
  }

  const duplicates = [];
  const filenames = {};

  try {
    await findDuplicates(directory, duplicates, filenames);

    if (writeToJson) {
      await fs.writeFile("duplicates.json", JSON.stringify(duplicates, null, 2));
      console.log("duplicates.json has been saved");
    } else {
      console.log(duplicates);
    }
  } catch (err) {
    // Report the failure and signal it through the exit code instead of
    // leaving the promise rejection unhandled.
    console.error(`Duplicate scan failed: ${err.message}`);
    process.exitCode = 1;
  }
})();

Run using: node find-duplicates.js /YourPathName/SubFolder/etc/

It will then output a JSON file (duplicates.json) listing the path of every file whose name has already been seen elsewhere, and you can then manually take a look to see if you want to delete them or not.

Please note, however, that this only compares the file names (ignoring extensions), regardless of what the files actually contain. We can improve this a bit further by comparing both the file name and the file size. Here we go:

const fs = require("fs").promises;
const path = require("path");

/* To run this file: 
node find-duplicate-files.js "/Volumes/5TB"
*/

// Root directory to scan: the first command-line argument after the script path
const [directory] = process.argv.slice(2);
// Output mode: true writes the results to a JSON file, false logs them to the console
const writeToJson = true;

/**
 * Recursively walks `dir` and collects files that share both their base name
 * (file name without its extension) and their size in bytes with a file seen
 * earlier in the walk.
 *
 * Entries whose name starts with "." (hidden files and directories) are skipped.
 *
 * @param {string} dir - Directory to scan.
 * @param {string[]} duplicates - Output array; the full path of every file that
 *   repeats an already-seen name/size pair is pushed onto it. The first file of
 *   each pair is not listed.
 * @param {Object} filenames - Accumulator of files seen so far, keyed by
 *   "<name>/<size>" and holding the fs.Stats of the first file with that key.
 *   Pass an empty object for a fresh scan.
 * @returns {Promise<void>} Resolves when `dir` and all its subdirectories have
 *   been visited; rejects if a directory or file cannot be read.
 */
const findDuplicates = async (dir, duplicates, filenames) => {
  const entries = await fs.readdir(dir);

  for (const entry of entries) {
    if (entry.startsWith(".")) continue;

    const filePath = path.join(dir, entry);
    const stats = await fs.stat(filePath);

    if (stats.isDirectory()) {
      await findDuplicates(filePath, duplicates, filenames);
      continue;
    }

    const { name } = path.parse(entry);

    // Key on name AND size. Remembering only the first size seen per name
    // misses later files that match each other but not that first file.
    // "/" cannot occur inside a file name, so the two parts never run together,
    // and the key can never equal an inherited property such as "constructor".
    const key = `${name}/${stats.size}`;

    if (Object.prototype.hasOwnProperty.call(filenames, key)) {
      duplicates.push(filePath);
    } else {
      filenames[key] = stats;
    }
  }
};

// Entry point: scans `directory` for files that share a name and size, then
// either writes the list to duplicates.json (relative to the current working
// directory) or logs it to the console, depending on `writeToJson`.
(async () => {
  // Without this guard a missing argument surfaces as an opaque TypeError
  // from fs.readdir(undefined).
  if (!directory) {
    console.error("Usage: node find-duplicate-files.js <directory>");
    process.exitCode = 1;
    return;
  }

  const duplicates = [];
  const filenames = {};

  try {
    await findDuplicates(directory, duplicates, filenames);

    if (writeToJson) {
      await fs.writeFile("duplicates.json", JSON.stringify(duplicates, null, 2));
      console.log("duplicates.json has been saved");
    } else {
      console.log(duplicates);
    }
  } catch (err) {
    // Report the failure and signal it through the exit code instead of
    // leaving the promise rejection unhandled.
    console.error(`Duplicate scan failed: ${err.message}`);
    process.exitCode = 1;
  }
})();

Now when you run it, it will look for files that share the same name and size, so the results are much more likely to be genuine duplicates.

We could go even further and do some kind of MD5 checksum, but I'll leave that to you to figure out. 😁