[SOLVED] recovering files into directories

knezi · 2015-06-18 17:39:48

Hi,
I have two external HDDs. B is a mirror copy of A, but it is not fully up to date. I accidentally erased the data on A. I was able to recover the files with Foremost, so now I have a lot of files in directories sorted only by extension, plus the partial backup of A on B. Is there a way to delete the recovered files that are already stored on B, so that I do not have to go through the whole list by hand?
I am thinking of writing a script: compute hashes, then sort them, and then compare them to see what is missing. I have no idea how long that would take, but I bet it would be a really long time. Is there a simpler way to do it?
Thank you.

Last edited by knezi (2015-06-30 15:10:42)

ewaller · 2015-06-18 17:56:04

Look into the fdupes command.

Edit: I would create a mount point on the one disk, and mount the other at that mount point.
Then, use the -option on the top level disk

Last edited by ewaller (2015-06-18 17:58:11)

charli3 · 2015-06-19 15:00:18

A noob myself but you might try using some combination of

cp -u -r
# updates recursively only if newer or missing in target directory

I suppose this assumes the directory tree is still intact...

ewaller · 2015-06-19 15:31:42

Or this, which I wrote before I found out about fdupes. Written in Python, depends on PyQt. Uses MD5 sums. It displays the hashes in an expandable tree. Expand the tree to find the files with the hash.

#! /usr/bin/python
"""
Locate identical (based upon MD5 hash) files in a directory tree.  
"""
from optparse import OptionParser
import subprocess,locale
import sys

from PyQt4 import QtGui


# Parsed command-line options; assigned by main() and read by findDuplicates().
options=None

def findDuplicates(args):
        """
        Find groups of files with identical MD5 sums below a directory.

        args is a two-element sequence: args[0] is the directory handed to
        find(1) and args[1] is the case-insensitive file name pattern
        (find's -iname).  Every match is hashed with md5sum(1) and the
        resulting lines are ordered with sort(1) so equal hashes are adjacent.

        Returns a list with one entry per hash that occurs more than once;
        each entry is [hash, file1, file2, ...].
        """
        theArgs= ["find", '-H', args[0],'-iname',args[1],'-exec','md5sum', '{}', ';']
        if options.verbose:
                print (theArgs)
        finder = subprocess.Popen(theArgs,stdout=subprocess.PIPE)
        sorter = subprocess.Popen(["sort"],stdin=finder.stdout,stdout=subprocess.PIPE)
        # Drop our copy of the pipe's read end: sort owns it now, and find
        # must receive SIGPIPE (instead of blocking) if sort exits early.
        finder.stdout.close()
        sortedOutput = sorter.communicate()[0].decode()
        # Reap find so it does not linger as a zombie process.
        finder.wait()

        groups=[]
        previousDigest=None
        for line in sortedOutput.split('\n'):
                hit=line.partition(' ')
                if options.verbose:
                        print (hit)
                digest=hit[0]
                if not digest:
                        # Blank line, e.g. the empty string after the final newline.
                        continue
                # md5sum separates hash and name with a space plus a one-character
                # mode flag (' ' or '*'); partition() consumed the space, so drop
                # the flag to obtain the bare file name.
                filename=hit[2][1:]
                if digest == previousDigest:
                        groups[-1].append(filename)
                else:
                        groups.append([digest, filename])
                        previousDigest=digest
        # Only hashes shared by at least two files count as duplicates.
        return [group for group in groups if len(group) > 2]

def main():
        """
        Command-line entry point.

        Expects exactly two positional arguments (path and filespec), runs
        findDuplicates() on them and presents the result either as indented
        text (-G/--nogui) or in a Qt tree view, one top-level row per hash.
        """
        global options
        parser = OptionParser("usage: %prog [options] path filespec")
        parser.add_option("-v", "--verbose", action="store_true", dest="verbose",  help="Increase verbosity")
        parser.add_option("-G", "--nogui", action="store_true", dest="nogui",  help="Run without GUI")
        options, args = parser.parse_args()
        if len(args) != 2:
                parser.error("incorrect number of arguments")
        if options.verbose:
                print (args)
        duplicates = findDuplicates(args)

        if options.nogui:
                # Plain text report: the hash, then its files indented below it.
                for group in duplicates:
                        print (group[0])
                        for filename in group[1:]:
                                print ("   "+filename)
                return

        # GUI report: each hash is a parent row whose children are the files.
        app = QtGui.QApplication(sys.argv)
        model = QtGui.QStandardItemModel()
        rootItem = model.invisibleRootItem()
        for group in duplicates:
                hashRow = QtGui.QStandardItem(group[0])
                rootItem.appendRow(hashRow)
                for filename in group[1:]:
                        hashRow.appendRow(QtGui.QStandardItem(filename))
        tree = QtGui.QTreeView()
        tree.setModel(model)
        tree.setWindowTitle(tree.tr("Duplicate Files"))
        tree.resize(640, 480)
        tree.show()
        sys.exit(app.exec_())

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
        main()

ewaller · 2015-06-19 15:33:49

Or, if you prefer C, this one uses a SHA-256 hash:

/*  This program searches a sub directory tree, performs a 
 *  hash on each file, and compares the hashes to find
 *  duplicate files.  To do this, it sorts the files it
 *  finds by their hashes.
 */

/* Passed as the third argument to ftw() (its file descriptor limit) */
#define MAX_DESCRIPTORS 100
/* libgcrypt digest algorithm used to fingerprint each file */
#define CHECKSUM_TYPE GCRY_MD_SHA256 
/* Number of raw digest bytes that fileParser() hex-encodes per file */
#define SIZE_OF_CHECKSUM 32
/* Values for the int flags used as booleans in this file */
#define TRUE 1
#define FALSE 0

#include <stdlib.h>
#include <stdio.h>
#include <argp.h>
#include <ftw.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>
#include <gcrypt.h>
#include <assert.h>

/* Selects which ResultsStruct field ResultsCompare() orders by:
 * FILENAME compares the name strings, HASH compares the checksum strings.
 */
typedef enum{
  FILENAME,
  HASH
} SortBy;

/* Command line settings filled in by parse_opt() */
typedef struct {
  /* Regular expression text given with -f; main() presets it to defaultRegex */
  char *filter;	
  /* Set to 1 by -q/-s (silent) and -v (verbose) respectively; 0 otherwise */
  int silent, verbose;
} ProgramArguments;

/* One scanned file.  In the arrays built by this program an entry whose
 * name is NULL marks the end of the list.
 */
typedef struct
{
  /* Path of the file as reported by ftw() (heap copy made by CreateString) */
  char* name;
  /* Lower-case hex text of the file's digest (heap copy made by CreateString) */
  char* checksum; 
} ResultsStruct;

/* Option table handed to the argp parser through the argp struct below */
static struct argp_option options[] =
{
  {"verbose", 'v', 0, 0, "Produce verbose output"},
  {"quiet", 'q', 0, 0, "Do not produce any status output"},
  /* Declared as an alias; parse_opt() handles 's' exactly like 'q' */
  {"silent", 's', 0, OPTION_ALIAS},
  /* The only option that takes a value (REGEX) */
  {"filter", 'f', "REGEX",0,"Filter using REGEX"}, 
  /* All-zero entry ends the table */
  {0}
};

/* Forward declarations: both functions are referenced before their definitions */
static int fileParser(const char* fileName, const struct stat* theStat, int flags );
static error_t parse_opt (int key, char *arg, struct argp_state *state);

/* Growing array of scanned files; fileParser() keeps a NULL-name entry after the last result */
static ResultsStruct* theResults;
/* Parsed command line settings */
static ProgramArguments arguments;
/* Number of files recorded in theResults so far */
static unsigned int flyWheel;
/* Compiled file name filter, matched against each path in fileParser() */
static regex_t regex;
/* libgcrypt digest handle: opened in main(), reset for every file in fileParser() */
static gcry_md_hd_t theCryptoEnginePtr;


/* Global names defined by the argp interface for the version string and
 * the bug-report address
 */
const char *argp_program_version = "dup";
const char *argp_program_bug_address = "<ewwaller+code@gmail.com>";
/* Program description; the "\v" separates the summary from the
 * text describing the positional arguments
 */
static char doc[] =
  "Dup -- Locate duplicate files in a sub directory tree"
  "\v   path   :  Top directory of the search tree in the file system\n"
  "             Multiple directories may be provided, separated by white space"; 

/* Usage synopsis for the positional arguments */
static char args_doc[] = "path [path...] ";
/* Parser definition passed to argp_parse() in main() */
static struct argp argp = { options, parse_opt, args_doc, doc };

/* Filter that main() installs before the command line is parsed */
static char* defaultRegex=".*";

/* 
 * Functions Follow 
*/

char* 
CreateString(const char* theInitialString)
{
  /* Return a freshly malloc'ed duplicate of theInitialString,
   * terminating NUL included.  The caller owns the returned buffer.
   * Running out of memory is fatal: a message is written to stderr
   * and the program exits with status 1.
   */

  size_t bytesNeeded = strlen(theInitialString) + 1;
  char* theCopy = (char*)malloc(bytesNeeded);

  if (theCopy == NULL)
    {
      fprintf(stderr,"\nFatal Error:  Out of memory in in string alloction\n");
      exit (1);
    }

  /* bytesNeeded covers the terminator, so one block copy is enough */
  return (char*)memcpy(theCopy, theInitialString, bytesNeeded);
}

static 
error_t 
parse_opt (int key, char *arg, struct argp_state *state)
{
  /*
   *  argp callback: invoked once per option and once per positional
   *  argument.  Options are recorded in the ProgramArguments structure
   *  supplied as the parser input; every positional argument is treated
   *  as a directory and walked immediately with ftw(), which feeds each
   *  entry to fileParser().
   *
   *  Returns 0 when the key was handled, EINVAL when the -f pattern
   *  does not compile, and ARGP_ERR_UNKNOWN for keys not handled here.
   */

  ProgramArguments* argumentPtr = state->input;

  switch (key)
    {
    case 'q':
    case 's':
      argumentPtr->silent = 1;
      break;
    case 'v':
      argumentPtr->verbose = 1;
      break;
    case 'f':
      argumentPtr->filter = arg;
      /* main() has already compiled the default pattern into regex;
       * release it before compiling the replacement so it is not leaked
       */
      regfree (&regex);
      if (regcomp (&regex, argumentPtr->filter, 0))
        {
          fprintf(stderr,"Unable to parse regular exprsession\n");
          return EINVAL;
        }
      break;
    case ARGP_KEY_NO_ARGS:
      /* If there are no Arguments, that is bad.  We need at least one */
      argp_usage (state);
      /* Never fall through into the directory walk: arg is not a path here */
      break;
    case ARGP_KEY_ARG:
      /* All of the arguments are directories.  Process them one at a time.
       * fileParser() always returns 0, so a non-zero result means ftw()
       * itself failed (for example the path does not exist); report it
       * and carry on with the remaining arguments.
       */
      if (ftw (arg, fileParser, MAX_DESCRIPTORS) != 0)
        perror (arg);
      break;
    default:
      return ARGP_ERR_UNKNOWN;
    }
  return 0;
}

static 
int 
fileParser(const char* fileName, const struct stat* theStat, int flags )
{
  /* ftw() callback.  For every entry reported as a file (FTW_F) that is
   * not a FIFO and whose path matches the compiled filter, compute the
   * digest of the file contents and append a {name, hex digest} entry
   * to the global results array.  The array always keeps one extra
   * entry with a NULL name after the last result as an end marker.
   *
   * Always returns 0 so the tree walk continues; files that cannot be
   * opened or read are reported on stderr and skipped.
   */

  unsigned char theBuffer[65536];
  char theCheckSumString[(SIZE_OF_CHECKSUM*2) +1];
  const unsigned char *theDigest;
  unsigned long long length = 0;
  size_t bytesRead;
  int readFailed;
  int i;

  if (flags != FTW_F)
    return 0;
  if (S_ISFIFO (theStat->st_mode))
    return 0;
  if (regexec (&regex, fileName, 0,NULL,0))
    return 0;
  
  FILE *theFile = fopen(fileName,"rb");
  if (!theFile)
    {
      fprintf(stderr,"\nCould not open the file %s\n",fileName);
      return 0;
    }
  gcry_md_reset (theCryptoEnginePtr);
  /* Feed the digest in large chunks rather than one byte per call */
  while ((bytesRead = fread (theBuffer, 1, sizeof(theBuffer), theFile)) > 0)
    {
      gcry_md_write (theCryptoEnginePtr, theBuffer, bytesRead);
      length += bytesRead;
    }
  readFailed = ferror (theFile);
  fclose(theFile);
  if (readFailed)
    {
      /* A partial digest would make unreadable files look identical */
      fprintf(stderr,"\nCould not read the file %s\n",fileName);
      return 0;
    }
  
  /* Room for the new entry plus the NULL-name end marker */
  theResults=(ResultsStruct*)realloc(theResults,(flyWheel+2)*sizeof(ResultsStruct));
  if (!theResults)
    {
      fprintf(stderr,"\nFatal Error:  Out of memory in file Parser\n");
      exit (1);
    }

  /* Render the raw digest as two hex characters per byte */
  theDigest = gcry_md_read(theCryptoEnginePtr,0);
  for (i=0 ; i<SIZE_OF_CHECKSUM ; i++)
    sprintf(theCheckSumString+2*i,"%02x",theDigest[i]);
   
  (theResults+flyWheel+1)->name=NULL;
  (theResults+flyWheel)->name = CreateString(fileName);
  (theResults+flyWheel)->checksum = CreateString(theCheckSumString);
  if(arguments.verbose)
    {
      printf ("Input: %s : %s ",(theResults+flyWheel)->checksum,fileName);
      printf("Length %llu\n",length);
    }
  assert(strlen(theCheckSumString)==strlen((theResults+flyWheel)->checksum));
  flyWheel++;
  assert(!(theResults+flyWheel)->name);
  if (!arguments.silent)
    {
      /* Carriage return keeps the progress counter on a single line */
      printf(" Processed %u files\r", flyWheel);
      fflush(stdout);
    }
  return 0; 
}


int 
ResultsCompare(const void *first, const void *second, SortBy* sortType)
{
  /* 
   *  This function is used in the sorting of the results.  It returns
   *  a negative number is first < second, 0 if they are equal, and
   *  a positive number is first > second.  The decision is made based 
   *  upon the hash stored in the entry of the results table.
   *  The sortType Flag tells us whether to sort by file name or hash
   */

  if (*sortType == HASH)  
    return strcmp (((ResultsStruct*)first)->checksum,((ResultsStruct*)second)->checksum);
  else
    return strcmp (((ResultsStruct*)first)->name,((ResultsStruct*)second)->name);
}

static
void 
RemoveDuplicates(ResultsStruct* theList, char* theHashStr)
{
  /*  theList holds every result that shares the hash theHashStr and
   *  ends with an entry whose name is NULL.  The same path can appear
   *  more than once when overlapping directories were scanned
   *  (ex:  . and ./subdir), and a file is not a duplicate of itself.
   *
   *  The list is sorted by name in place so repeated paths become
   *  neighbours.  If at least two distinct names remain, the hash is
   *  printed once, followed by each distinct name.
   */

  SortBy orderByName = FILENAME;
  ResultsStruct* entry;
  const char* lastName = NULL;
  int headerPrinted = FALSE;
  int entryCount = 0;

  /* The sort needs the element count, so walk to the end marker first */
  for (entry = theList; entry->name; entry++)
    entryCount++;
  qsort_r(theList, entryCount, sizeof(ResultsStruct), ResultsCompare, &orderByName);

  for (entry = theList; entry->name; lastName = entry->name, entry++)
    {
      /* Nothing to report for the first entry or a repeat of the same path */
      if (lastName == NULL || strcmp(lastName, entry->name) == 0)
        continue;

      if (!headerPrinted)
        {
          /* First distinct pair: emit the hash and the earlier name once */
          printf("SHA-256 : %s\n",theHashStr);
          printf("    %s\n",lastName);
          headerPrinted = TRUE;
        }
      printf("    %s\n",entry->name);
    }
}

static
void 
ProcessResults(void)
{

  /* Sort the global results by hash and walk them, collecting each run
   * of entries with the same hash into a temporary NULL-terminated
   * list.  Whenever the hash changes (and once more at the end) the
   * collected run is handed to RemoveDuplicates(), which prints it if
   * it contains more than one distinct file name.
   *
   * Does nothing when no file was recorded.  Running out of memory is
   * fatal, as elsewhere in this program.
   */

  ResultsStruct *current= theResults;
  ResultsStruct *previous = NULL;
  SortBy hashSort = HASH;
  ResultsStruct* theFileNameList = NULL;
  ResultsStruct* grownList;
  int count = 0;

  if (!theResults)
    return;
  qsort_r(theResults,flyWheel,sizeof(ResultsStruct), ResultsCompare, &hashSort);

  while (current->name)
    {

      /* If the check sum is not the same as it was last time, the
         run collected so far is complete: report it and start a
         new, empty run
      */

      if (previous && strcmp (previous->checksum, current->checksum))
        {
          RemoveDuplicates(theFileNameList,previous->checksum);
          free(theFileNameList);
          theFileNameList = NULL;
          count=0;
        }

      /* Room for the new entry plus the NULL-name end marker */
      grownList = (ResultsStruct*)realloc(theFileNameList,
                                          sizeof(ResultsStruct)*(count+2));
      if (!grownList)
        {
          fprintf(stderr,"\nFatal Error:  Out of memory in Process Results\n");
          exit (1);
        }
      theFileNameList = grownList;

      /* The run borrows the strings owned by theResults; only the
         array itself is freed later
      */
      (theFileNameList+count)->name = current->name;
      (theFileNameList+count)->checksum = current->checksum;
      (theFileNameList+count+1)->name=NULL;
      previous = current++;
      count++;
    }

  /* We reached the end of the list, but ensure that the end
     of the list does not have duplicates that have not yet
     been printed
  */
  
  if (theFileNameList)
    RemoveDuplicates(theFileNameList,previous->checksum);
  free(theFileNameList);
  theFileNameList = NULL;
}

int
main (int argc, char **argv)
{

  /* Default values. */
  arguments.silent = 0;
  arguments.verbose = 0;
  arguments.filter= defaultRegex;

  regcomp (&regex, arguments.filter, 0);
  if (gcry_md_open (&theCryptoEnginePtr,CHECKSUM_TYPE, 0))
    {
      fprintf(stderr,"Unable to initialize crypto engine\n");
      exit (1);
    }

  /*  Parse our arguments; every option seen by parse_opt will be
   *  reflected in arguments.  This function call also
   *  searches each directory name passed in on the command line
   */
  argp_parse (&argp, argc, argv, 0, 0, &arguments);
  
  gcry_md_close (theCryptoEnginePtr);

  if(!arguments.silent)
    printf("\n");
  if(arguments.verbose)
    {
      printf ("\nfilter = %s\n", arguments.filter);
      printf ("VERBOSE = %s\nSILENT = %s\n",
	      arguments.verbose ? "yes" : "no",
	      arguments.silent ? "yes" : "no");
    }
  ProcessResults();
  exit (0);
}

knezi · 2015-06-30 15:07:39

Thanks a lot for all the replies!
They are great. I have used fdupes.

Arch Linux

#1 2015-06-18 17:39:48

[SOLVED] recovering files into directories

#2 2015-06-18 17:56:04

Re: [SOLVED] recovering files into directories

#3 2015-06-19 15:00:18

Re: [SOLVED] recovering files into directories

#4 2015-06-19 15:31:42

Re: [SOLVED] recovering files into directories

#5 2015-06-19 15:33:49

Re: [SOLVED] recovering files into directories

#6 2015-06-30 15:07:39

Re: [SOLVED] recovering files into directories

Board footer