diff --git a/downloadutils.py b/downloadutils.py index 7864926..f17e702 100755 --- a/downloadutils.py +++ b/downloadutils.py @@ -12,6 +12,7 @@ p.add_argument('--downloadImages', help='Should download images', action='store_true', default=False) p.add_argument('--downloadOriginalImages', help='Should download original images', action='store_true', default=False) p.add_argument('--downloadBoundingBox', help='Should download bouding box annotation files', action='store_true', default=False) + p.add_argument('--replaceIfExists', help='Image should be downloaded even if already exists on direcotry and replace it', action='store_true', default=False) # p.add_argument('--jobs', '-j', type=int, default=1, help='Number of parallel threads to download') # p.add_argument('--timeout', '-t', type=int, default=10, help='Timeout per image in seconds') # p.add_argument('--retry', '-r', type=int, default=10, help='Max count of retry for each image') @@ -31,8 +32,8 @@ if args.downloadImages is True: for id in args.wnid: - list = downloader.getImageURLsOfWnid(id) - downloader.downloadImagesByURLs(id, list) + mapping = downloader.getImageURLsMappingOfWnid(id) + downloader.downloadImagesByURLsMapping(id, mapping, replace_if_exists=args.replaceIfExists) if args.downloadBoundingBox is True: for id in args.wnid: diff --git a/libs/imagedownloader.py b/libs/imagedownloader.py index 4f433cc..43c0d43 100644 --- a/libs/imagedownloader.py +++ b/libs/imagedownloader.py @@ -15,7 +15,7 @@ class ImageNetDownloader: def __init__(self): self.host = 'http://www.image-net.org' - def download_file(self, url, desc=None, renamed_file=None): + def download_file(self, url, desc=None, renamed_file=None, replace_if_exists=False): u = urllib2.urlopen(url) scheme, netloc, path, query, fragment = urlparse.urlsplit(url) @@ -29,6 +29,13 @@ def download_file(self, url, desc=None, renamed_file=None): if desc: filename = os.path.join(desc, filename) + # if the file should not be replaced if already present + # then 
check if it already exists; if it does, then just return the file name + if replace_if_exists is False: + if os.path.isfile(filename): + print("Image already downloaded: {}".format(filename)) + return filename + with open(filename, 'wb') as f: meta = u.info() meta_func = meta.getheaders if hasattr(meta, 'getheaders') else meta.get_all @@ -87,12 +94,38 @@ def getImageURLsOfWnid(self, wnid): return imageUrls + def getImageURLsMappingOfWnid(self, wnid): + url = 'http://www.image-net.org/api/text/imagenet.synset.geturls.getmapping?wnid=' + str(wnid) + f = urllib.urlopen(url) + contents = f.read().split('\n') + imageUrlsMapping = [] + + for each_line in contents: + # Remove unnecessary char + each_line = each_line.replace('\r', '').strip() + if each_line: + # parsing each line into filename and imageUrl + each_line_split = each_line.split(' ') + + if len(each_line_split) != 2: + continue + + filename = each_line_split[0] + imageUrl = each_line_split[1] + + imageUrlsMapping.append({ + 'filename': filename, + 'url': imageUrl + }) + + return imageUrlsMapping + def mkWnidDir(self, wnid): if not os.path.exists(wnid): os.mkdir(wnid) return os.path.abspath(wnid) - def downloadImagesByURLs(self, wnid, imageUrls): + def downloadImagesByURLs(self, wnid, imageUrls, replace_if_exists=False): # save to the dir e.g: n005555_urlimages/ wnid_urlimages_dir = os.path.join(self.mkWnidDir(wnid), str(wnid) + '_urlimages') if not os.path.exists(wnid_urlimages_dir): @@ -100,11 +133,25 @@ def downloadImagesByURLs(self, wnid, imageUrls): for url in imageUrls: try: - self.download_file(url, wnid_urlimages_dir) + self.download_file(url, wnid_urlimages_dir, replace_if_exists=replace_if_exists) except Exception, error: print 'Fail to download : ' + url print str(error) + def downloadImagesByURLsMapping(self, wnid, imageUrlsMapping, replace_if_exists=False): + # save to the dir e.g: n005555_urlimages/ + wnid_urlimages_dir = os.path.join(self.mkWnidDir(wnid), str(wnid) + '_urlimages') + if not 
os.path.exists(wnid_urlimages_dir): + os.mkdir(wnid_urlimages_dir) + + for imageInfo in imageUrlsMapping: + try: + self.download_file(imageInfo['url'], wnid_urlimages_dir, imageInfo['filename']+'.JPEG', replace_if_exists=replace_if_exists) + except Exception, error: + print 'Fail to download : ' + imageInfo['url'] + print str(error) + + def downloadOriginalImages(self, wnid, username, accesskey): download_url = 'http://www.image-net.org/download/synset?wnid=%s&username=%s&accesskey=%s&release=latest&src=stanford' % (wnid, username, accesskey) try: