You are not logged in.
Pages: 1
Hi,
I have two external HDDs. B is a mirror copy of A, but it is not fully up to date. I accidentally erased the data on A. I was able to recover the files with Foremost, so now I have a lot of files in directories sorted only by extension, plus a partial backup of A on B. Is there a way to delete the recovered files that are already stored on B, so that I do not have to go through the whole list by hand?
I am thinking of writing a script: compute hashes, sort them, and then compare them to see what is missing. I have no idea how long that would take, but I bet it would be a really long time. Is there a simpler way to do it?
Thank you.
Last edited by knezi (2015-06-30 15:10:42)
Offline
Look into the fdupes command.
Edit: I would create a mount point on one disk and mount the other disk at that mount point.
Then, use the -r (recurse) option on the top-level disk.
Last edited by ewaller (2015-06-18 17:58:11)
Nothing is too wonderful to be true, if it be consistent with the laws of nature -- Michael Faraday
Sometimes it is the people no one can imagine anything of who do the things no one can imagine. -- Alan Turing
---
How to Ask Questions the Smart Way
Offline
A noob myself but you might try using some combination of
cp -u -r
# updates recursively only if newer or missing in target directory
I suppose this assumes the directory tree is still intact...
Offline
Or this, which I wrote before I found out about fdupes. It is written in Python, depends on PyQt, and uses MD5 sums. It displays the hashes in an expandable tree; expand a hash to see the files that share it.
#! /usr/bin/python
"""
Locate identical (based upon MD5 hash) files in a directory tree.
"""
from optparse import OptionParser
import subprocess,locale
import sys
from PyQt4 import QtGui
# Parsed command-line options. main() rebinds this global from
# parser.parse_args(); findDuplicates() reads options.verbose from it.
options=None
def findDuplicates(args):
    """
    Find groups of files whose contents have the same MD5 sum.

    args[0] is the directory handed to find(1) and args[1] is the
    case-insensitive name pattern (find's -iname test).  Every match is
    passed to md5sum(1) and the resulting lines are piped through sort(1),
    so lines with equal hashes end up next to each other.

    Returns a list with one entry per hash shared by at least two files.
    Each entry is a list of the form [hash, file1, file2, ...].
    """
    theArgs = ["find", "-H", args[0], "-iname", args[1],
               "-exec", "md5sum", "{}", ";"]
    if options.verbose:
        print(theArgs)

    finder = subprocess.Popen(theArgs, stdout=subprocess.PIPE)
    sorter = subprocess.Popen(["sort"], stdin=finder.stdout,
                              stdout=subprocess.PIPE)
    # Drop our copy of the pipe's read end so that sort is its only reader.
    finder.stdout.close()
    # errors="replace": a file name that is not valid UTF-8 must not abort
    # the whole scan.
    output = sorter.communicate()[0].decode(errors="replace")
    finder.wait()  # reap the find process

    returnValue = []
    currentHash = None
    currentFiles = []
    for line in output.split('\n'):
        if not line:
            continue
        digest, _, rest = line.partition(' ')
        # md5sum prints "<hash> <mode><name>" where <mode> is ' ' or '*'.
        # Skip that one mode character so the name is reported verbatim.
        filename = rest[1:]
        if options.verbose:
            print((digest, filename))
        if digest != currentHash:
            # A new hash starts: emit the previous group if it had duplicates.
            if len(currentFiles) > 1:
                returnValue.append([currentHash] + currentFiles)
            currentHash = digest
            currentFiles = []
        currentFiles.append(filename)

    # Flush the final group explicitly instead of relying on a trailing
    # empty line in the sorted output.
    if len(currentFiles) > 1:
        returnValue.append([currentHash] + currentFiles)
    return returnValue
def main():
    """
    Command-line entry point.

    Expects exactly two positional arguments (path and filespec), looks up
    duplicate files with findDuplicates() and then either prints the groups
    to stdout (-G / --nogui) or shows them in a Qt tree view.
    """
    global options

    parser = OptionParser("usage: %prog [options] path filespec")
    parser.add_option("-v", "--verbose", action="store_true",
                      dest="verbose", help="Increase verbosity")
    parser.add_option("-G", "--nogui", action="store_true",
                      dest="nogui", help="Run without GUI")
    options, args = parser.parse_args()

    if len(args) != 2:
        parser.error("incorrect number of arguments")
    if options.verbose:
        print(args)

    duplicates = findDuplicates(args)

    if options.nogui:
        # Plain text report: the hash, then one line per file sharing it.
        for group in duplicates:
            print(group[0])
            for path in group[1:]:
                print(" " + path)
        return

    # GUI report: one top-level row per hash, one child row per file.
    app = QtGui.QApplication(sys.argv)
    model = QtGui.QStandardItemModel()
    root = model.invisibleRootItem()
    for group in duplicates:
        hashRow = QtGui.QStandardItem(group[0])
        root.appendRow(hashRow)
        for path in group[1:]:
            hashRow.appendRow(QtGui.QStandardItem(path))

    tree = QtGui.QTreeView()
    tree.setModel(model)
    tree.setWindowTitle(tree.tr("Duplicate Files"))
    tree.resize(640, 480)
    tree.show()
    sys.exit(app.exec_())
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Nothing is too wonderful to be true, if it be consistent with the laws of nature -- Michael Faraday
Sometimes it is the people no one can imagine anything of who do the things no one can imagine. -- Alan Turing
---
How to Ask Questions the Smart Way
Offline
Or, if you prefer C, this one uses a SHA-256 hash:
/* This program searches a sub directory tree, performs a
* hash on each file, and compares the hashes to find
* duplicate files. To do this, it sorts the files it
* finds by their hashes.
*/
/* Passed as the third argument of ftw() when walking a directory tree */
#define MAX_DESCRIPTORS 100
/* libgcrypt algorithm identifier used when opening the hash handle */
#define CHECKSUM_TYPE GCRY_MD_SHA256
/* Number of digest bytes formatted per file; the hex string buffer is
 * sized as twice this value plus a terminator */
#define SIZE_OF_CHECKSUM 32
/* Boolean constants used for the local flag variables */
#define TRUE 1
#define FALSE 0
#include <stdlib.h>
#include <stdio.h>
#include <argp.h>
#include <ftw.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>
#include <gcrypt.h>
#include <assert.h>
/* Selects which field of a ResultsStruct the comparison function
 * ResultsCompare uses as the sort key.
 */
typedef enum
{
  FILENAME = 0,   /* order by the name field */
  HASH = 1        /* order by the checksum field */
} SortBy;
/* Settings collected from the command line by parse_opt. */
typedef struct
{
  char *filter;   /* regular expression text; replaced by the -f option */
  int silent;     /* set to 1 by -q / -s: suppress the progress output */
  int verbose;    /* set to 1 by -v: print extra per-file information */
} ProgramArguments;
/* One scanned file.  Arrays of these are terminated by an entry whose
 * name is NULL.
 */
typedef struct
{
  char *name;       /* path of the file as reported by the tree walk */
  char *checksum;   /* digest of the file contents as a hex string */
} ResultsStruct;
/* Command line options understood by the program; the table is handed to
 * argp through the argp structure and each key is handled in parse_opt.
 * The all-zero entry terminates the table.
 */
static struct argp_option options[] =
{
{"verbose", 'v', 0, 0, "Produce verbose output"},
{"quiet", 'q', 0, 0, "Do not produce any status output"},
/* --silent / -s is declared as an alias of the option listed before it */
{"silent", 's', 0, OPTION_ALIAS},
/* The only option that takes an argument (REGEX) */
{"filter", 'f', "REGEX",0,"Filter using REGEX"},
{0}
};
/* Forward declarations: both functions are referenced before they are defined */
static int fileParser(const char* fileName, const struct stat* theStat, int flags );
static error_t parse_opt (int key, char *arg, struct argp_state *state);
/* Growing array of scanned files; fileParser keeps an entry with a NULL
 * name after the last valid element */
static ResultsStruct* theResults;
/* Option values; passed to argp_parse as the parser input */
static ProgramArguments arguments;
/* Number of valid entries currently stored in theResults */
static unsigned int flyWheel;
/* Compiled form of the file name filter */
static regex_t regex;
/* Hash handle opened in main and reset by fileParser for every file */
static gcry_md_hd_t theCryptoEnginePtr;
/* Well-known global names that argp looks up (version string and bug
 * report address) */
const char *argp_program_version = "dup";
const char *argp_program_bug_address = "<ewwaller+code@gmail.com>";
/* Program description for the help output; the text after \v is the part
 * argp prints after the option list */
static char doc[] =
"Dup -- Locate duplicate files in a sub directory tree"
"\v path : Top directory of the search tree in the file system\n"
" Multiple directories may be provided, separated by white space";
/* Synopsis of the non-option arguments */
static char args_doc[] = "path [path...] ";
/* Ties the option table, the parser callback and the help texts together */
static struct argp argp = { options, parse_opt, args_doc, doc };
/* Filter used when no -f option is given */
static char* defaultRegex=".*";
/*
* Functions Follow
*/
/*
 * CreateString -- return a newly allocated copy of a string.
 *
 * theInitialString must point to a NUL-terminated string.  The copy is
 * obtained with malloc and is owned by the caller, who releases it with
 * free().  Running out of memory is treated as fatal: a diagnostic is
 * written to stderr and the process exits with status 1, so the function
 * never returns NULL.
 */
char*
CreateString(const char* theInitialString)
{
  /* Measure once; the size includes the terminating NUL */
  size_t size = strlen(theInitialString) + 1;
  char *theStringPtr = malloc(size);

  if (!theStringPtr)
    {
      fprintf(stderr,"\nFatal Error: Out of memory in string allocation\n");
      exit (1);
    }
  memcpy(theStringPtr, theInitialString, size);
  return theStringPtr;
}
static
error_t
parse_opt (int key, char *arg, struct argp_state *state)
{
/*
* Process the command line arguments and options. Collect all
* the options and remember their state. Then, treat all of the
* arguments as directories and iterate through their contents
* building the results array with the names and hashes
*/
ProgramArguments* argumentPtr = state->input;
switch (key)
{
case 'q':
case 's':
argumentPtr->silent = 1;
break;
case 'v':
argumentPtr->verbose = 1;
break;
case 'f':
argumentPtr->filter = arg;
if (regcomp (®ex, arguments.filter, 0))
{
fprintf(stderr,"Unable to parse regular exprsession\n");
return EINVAL;
}
break;
case ARGP_KEY_NO_ARGS:
/* If there are no Arguments, that is bad. We need at least one */
argp_usage (state);
case ARGP_KEY_ARG:
/* All of the arguments are directories. Process them one at a time */
ftw (arg, fileParser, MAX_DESCRIPTORS);
break;
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
static
int
fileParser(const char* fileName, const struct stat* theStat, int flags )
{
/* Iterate through the given directory and determine, for each file name,
* the hash of the contents of the file. Add those results to the
* results array
*/
char *contents;
char theCheckSumString[(SIZE_OF_CHECKSUM*2) +1];
ResultsStruct newResult;
unsigned int length = 0;
char theNextChar;
char c;
int i;
if (flags != FTW_F)
return 0;
if (S_ISFIFO (theStat->st_mode))
return 0;
if (regexec (®ex, fileName, 0,NULL,0))
return 0;
FILE *theFile = fopen(fileName,"r");
if (!theFile)
{
fprintf(stderr,"\nCould not open the file %s\n",fileName);
return 0;
}
gcry_md_reset (theCryptoEnginePtr);
while (fread (&theNextChar, sizeof(theNextChar),1,theFile))
{
gcry_md_putc(theCryptoEnginePtr,theNextChar);
length++;
}
fclose(theFile);
theResults=(ResultsStruct*)realloc(theResults,(flyWheel+2)*sizeof(ResultsStruct));
if (!theResults)
{
fprintf(stderr,"\nFatal Error: Out of memory in file Parser\n");
exit (1);
}
for (i=0 ; i<SIZE_OF_CHECKSUM ; i++)
sprintf(theCheckSumString+2*i,"%02x",gcry_md_read(theCryptoEnginePtr,0)[i]);
(theResults+flyWheel+1)->name=NULL;
(theResults+flyWheel)->name = CreateString(fileName);
(theResults+flyWheel)->checksum = CreateString(theCheckSumString);
if(arguments.verbose)
{
printf ("Input: %s : %s ",(theResults+flyWheel)->checksum,fileName);
printf("Length %i\n",length);
}
assert(strlen(theCheckSumString)==strlen((theResults+flyWheel)->checksum));
flyWheel++;
assert(!(theResults+flyWheel)->name);
if (!arguments.silent)
{
printf(" Processed %i files\r", flyWheel);
fflush(stdout);
}
return 0;
}
int
ResultsCompare(const void *first, const void *second, SortBy* sortType)
{
/*
* This function is used in the sorting of the results. It returns
* a negative number is first < second, 0 if they are equal, and
* a positive number is first > second. The decision is made based
* upon the hash stored in the entry of the results table.
* The sortType Flag tells us whether to sort by file name or hash
*/
if (*sortType == HASH)
return strcmp (((ResultsStruct*)first)->checksum,((ResultsStruct*)second)->checksum);
else
return strcmp (((ResultsStruct*)first)->name,((ResultsStruct*)second)->name);
}
/*
 * RemoveDuplicates -- print the distinct file names that share one hash.
 *
 * theList    : candidate entries, terminated by an entry with a NULL name;
 *              the array is sorted by name in place
 * theHashStr : the hash shared by the candidates, used in the heading
 *
 * The directories given on the command line may overlap (for example
 * "." and "./subdir"), in which case the same path shows up more than
 * once; a file is not a duplicate of itself.  After sorting by name,
 * repeated paths are adjacent and are collapsed.  Nothing is printed
 * unless at least two different names remain.
 */
static
void
RemoveDuplicates(ResultsStruct* theList, char* theHashStr)
{
  SortBy sortKey = FILENAME;
  int headingPrinted = FALSE;
  int entryCount = 0;
  char *lastName = NULL;
  ResultsStruct *entry;

  /* The sort needs the element count, so walk to the terminator first */
  for (entry = theList; entry->name; entry++)
    entryCount++;

  qsort_r(theList,entryCount,sizeof(ResultsStruct),ResultsCompare, &sortKey);

  for (entry = theList; entry->name; entry++)
    {
      /* A name that differs from its predecessor is another distinct file */
      if (lastName && strcmp(lastName,entry->name) != 0)
        {
          if (!headingPrinted)
            {
              /* First difference found: emit the heading and the first name */
              printf("SHA-256 : %s\n",theHashStr);
              printf(" %s\n",lastName);
              headingPrinted = TRUE;
            }
          printf(" %s\n",entry->name);
        }
      lastName = entry->name;
    }
}
static
void
ProcessResults(void)
{
/* Iterate through the results looking for the same hash. The results
* are first sorted by hash, so it becomes a matter of looking for
* multiple entries in a row with the same hash
*/
ResultsStruct *current= theResults;
ResultsStruct *previous = NULL;
unsigned int matched = FALSE;
SortBy hashSort = HASH;
ResultsStruct* theFileNameList=0;
int count = 0;
if (!theResults)
return;
qsort_r(theResults,flyWheel,sizeof(ResultsStruct), ResultsCompare, &hashSort);
while (current->name)
{
/* If the check sum is not the same as it was last time, see
If there are more than one VALID file names with the same
check sum. VALID means the same file name cannot be
duplicated (the directory was scanned twice)
*/
if (previous)
{
if (strcmp (previous->checksum,
current->checksum))
{
RemoveDuplicates(theFileNameList,previous->checksum);
if (theFileNameList)
{
free(theFileNameList);
theFileNameList = NULL;
count=0;
}
}
}
theFileNameList = (ResultsStruct*)realloc(theFileNameList,
sizeof(ResultsStruct)*(count+2));
(theFileNameList+count)->name = current->name;
(theFileNameList+count)->checksum = current->checksum;
(theFileNameList+count+1)->name=NULL;
previous = current++;
count++;
}
/* We reached the end of the list, but ensure that the end
of the list does not have duplicates that have not yet
been printed
*/
RemoveDuplicates(theFileNameList,previous->checksum);
free(theFileNameList);
theFileNameList = NULL;
}
int
main (int argc, char **argv)
{
/* Default values. */
arguments.silent = 0;
arguments.verbose = 0;
arguments.filter= defaultRegex;
regcomp (®ex, arguments.filter, 0);
if (gcry_md_open (&theCryptoEnginePtr,CHECKSUM_TYPE, 0))
{
fprintf(stderr,"Unable to initialize crypto engine\n");
exit (1);
}
/* Parse our arguments; every option seen by parse_opt will be
* reflected in arguments. This function call also
* searches each directory name passed in on the command line
*/
argp_parse (&argp, argc, argv, 0, 0, &arguments);
gcry_md_close (theCryptoEnginePtr);
if(!arguments.silent)
printf("\n");
if(arguments.verbose)
{
printf ("\nfilter = %s\n", arguments.filter);
printf ("VERBOSE = %s\nSILENT = %s\n",
arguments.verbose ? "yes" : "no",
arguments.silent ? "yes" : "no");
}
ProcessResults();
exit (0);
}
Nothing is too wonderful to be true, if it be consistent with the laws of nature -- Michael Faraday
Sometimes it is the people no one can imagine anything of who do the things no one can imagine. -- Alan Turing
---
How to Ask Questions the Smart Way
Offline
Thanks a lot for all the replies!
They are great. I have used fdupes.
Offline
Pages: 1