1
1
import os
2
2
import sys
3
- import hashlib
3
+ # import 5 hash functions: md5, sha1, blake2b from hashlib; xxh64, xxh128 from xxhash
4
+ import timeit
5
+ from hashlib import md5 , sha1 , blake2b
6
+ from xxhash import xxh64 , xxh128
4
7
8
+ hashfunc = 0
9
+ import_module = "import random"
5
10
6
- def find_duplicates (folders ):
11
+ def duplicates (folders ):
7
12
dup_size = {}
8
13
for i in folders :
9
14
if os .path .exists (i ):
@@ -17,14 +22,12 @@ def find_duplicates(folders):
17
22
for dup_list in dup_size .values ():
18
23
if len (dup_list ) > 1 :
19
24
join_dicts (dups , find_duplicate_hash (dup_list ))
20
- print_results (dups )
21
- return dups
22
25
23
26
24
27
def find_duplicate_size (parent_dir ):
25
28
dups = {} # format {size:[filepaths]}
26
29
for dirName , subdirs , fileList in os .walk (parent_dir ):
27
- # print(dirName, subdirs, fileList)
30
+ print (dirName , subdirs , fileList )
28
31
print ('Scanning %s ' % dirName )
29
32
for filename in fileList :
30
33
path = os .path .join (dirName , filename )
@@ -62,7 +65,24 @@ def join_dicts(dict1, dict2):
62
65
63
66
def hashfile (path , blocksize = 65536 ):
64
67
file = open (path , 'rb' )
65
- hasher = hashlib .md5 ()
68
+
69
+ hasher = md5 ()
70
+ # use switch case for hash functions
71
+ match hashfunc :
72
+ case 0 :
73
+ hasher = md5 ()
74
+ case 1 :
75
+ hasher = sha1 ()
76
+ case 2 :
77
+ hasher = blake2b ()
78
+ case 3 :
79
+ hasher = xxh64 ()
80
+ case 4 :
81
+ hasher = xxh128 ()
82
+ case _:
83
+ print ("Invalid hash function" )
84
+ sys .exit (1 )
85
+
66
86
buf = file .read (blocksize )
67
87
while len (buf ) > 0 :
68
88
hasher .update (buf )
@@ -88,21 +108,45 @@ def print_results(dict1):
88
108
89
109
90
110
def find_duplicates(dir):
    """Run the duplicate search on a single directory.

    Wraps ``dir`` in a one-element list and forwards it to ``duplicates``.

    Args:
        dir: Path of the directory to scan.

    Returns:
        Whatever ``duplicates`` returns for that one-directory list.
    """
    return duplicates([dir])
98
113
99
114
def remove_duplicates(dups):
    """Delete every file except the first one in each duplicate group.

    Args:
        dups: Mapping whose values are lists of file paths. The first path of
            each list is kept; all later paths are removed from disk. ``None``
            is treated the same as an empty mapping.

    Returns:
        True when ``dups`` is non-empty and the deletion pass ran, False when
        there was nothing to process.

    Raises:
        OSError: Propagated from ``os.remove`` when a file cannot be deleted.
    """
    # Truthiness covers both {} and None; len(None) would raise TypeError.
    if not dups:
        print("Duplicates not deleted")
        return False

    for paths in dups.values():
        # Keep index 0 as the surviving copy and remove the rest.
        for extra_copy in paths[1:]:
            os.remove(extra_copy)
    print("Duplicates deleted")
    return True
107
124
108
-
125
def main():
    """Time the duplicate search once per hash function and print the results.

    For each selector value 0-4 (md5, sha1, blake2b, xxh64, xxh128 in the
    order used by ``hashfile``), sets the module-level ``hashfunc``, runs
    ``find_duplicates`` over a fixed list of directories, and records the
    elapsed wall-clock time measured with ``timeit.default_timer``.
    """
    # hashfile() reads the module-level ``hashfunc``. Without this declaration
    # the assignment in the loop would only create a local, and every timed
    # run would use the default hasher (md5).
    global hashfunc

    folders = [
        "D:\\ GitHub\\ codebrewers-hackathon",
        "D:\\ GitHub\\ climateview",
        "D:\\ GitHub\\ js-samples",
        "D:\\ GitHub\\ MemoryGrid",
        "D:\\ GitHub\\ portfolio",
    ]
    # Index in this tuple == selector value matched in hashfile().
    labels = ("md5", "sha1", "blake2b", "xxh64", "xxh128")

    results = []
    for i in range(len(labels)):
        print("Hash function", i, "is being used")
        hashfunc = i
        starttime = timeit.default_timer()
        for folder in folders:
            find_duplicates(folder)
        results.append(timeit.default_timer() - starttime)

    print("\n Time taken for 5 hash functions: ")
    for label, elapsed in zip(labels, results):
        print(label + ": ", elapsed, " seconds")
149
+
150
+
151
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments