# coding=utf-8
import threading
import urllib
import urllib2
import re
import os
import glob
import sys
import getopt
import math
import time
import Queue
import json
import inspect
# !!! TODO
# • is {"Connection":"keep-alive"} being properly used? is it needed? is it helping? is it hurting?
#escape a filename [sounds bad, but I really don't have a guarantee that this is enough]
def esc(m):
s = str(m)
# dots are only useful for path traversal if nearby slashes, which we get rid of, so they should only be dangerous by themselves
if s == "." :return "。"
if s == "..":return "。。"
s = re.sub('\\\\','\',s)
s = re.sub('/','/',s)
s = re.sub('"','”',s)
s = re.sub('\'','’',s)
s = re.sub('\*','*',s)
s = re.sub('\$','$',s)
s = re.sub(':',':',s) # Apple's handling of this being a special character
return s
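# a quick illustration of the mapping above [derived from the substitutions, not separately tested] :
# L> esc("a/b:c*") -> "a/b:c*"
# L> esc("..") -> "。。"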
# if folder exists, good. if not, and no similars, make it. if not and similar[s], take first similar and rename it.
def assertFolder(foldername=None,wildname=None):
if not os.path.isdir(foldername):
matchA = glob.glob(wildname)
if len(matchA) == 0:
os.mkdir(foldername)
else:
os.rename(matchA[0],foldername)
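# usage sketch, mirroring the call made in Stage1Job below :
# L> assertFolder(foldername="newname#123",wildname="*#123")
# L> if nothing matches *#123 -> newname#123 is created
# L> if a stale oldname#123 exists -> it is renamed to newname#123, so past downloads survive e.g. a username change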
def printUsage():
global p
sys.stdout.write("usage : python pixivRoot.py [-t threadcount] [-c cookie] [-h] [--help]"+os.linesep)
sys.stdout.write("example : python pixivRoot.py"+os.linesep)
sys.stdout.write("example : python pixivRoot.py -t 32"+os.linesep)
sys.stdout.write("example : python pixivRoot.py -c PHPSESSID=01234567_0abcd123e4567f8g90hijk1234567lmn"+os.linesep)
sys.stdout.write("example : python pixivRoot.py -h"+os.linesep)
sys.stdout.write("example : python pixivRoot.py --help"+os.linesep)
sys.stdout.write("-h : display help information, which is what you're reading right now"+os.linesep)
sys.stdout.write("--help : display help information, which is what you're reading right now"+os.linesep)
sys.stdout.write("-t threadcount : number of threads used"+os.linesep)
sys.stdout.write(" type : integer | minimum : 1 | maximum : "+str(p["threadMaxC"])+" | default : "+str(p["threadC"])+os.linesep)
sys.stdout.write(" • as you use more threads, your download rate and CPU usage will rise"+os.linesep)
sys.stdout.write(" • out of courtesy toward pixiv, I recommend keeping threadcount relatively low"+os.linesep)
sys.stdout.write(" • pixiv is implied to have both an account and an IP blocking mechanism - scary stuff"+os.linesep)
sys.stdout.write("-c cookie : bypass sign-in process by using a specified cookie"+os.linesep)
sys.stdout.write(" this method is much more reliable than scripted sign-in, but requires manual labor and must be performed every time the cookie expires"+os.linesep)
sys.stdout.write(" the only part of the cookie that likely matters is the \"PHPSESSID\" key-value pair"+os.linesep)
sys.stdout.write(" to get this value, http sniff a signed-in request to pixiv's main page and copy the cookie that gets sent to pixiv"+os.linesep)
sys.stdout.write(" this value is marked as \"HttpOnly\", making this process impossible to JavaScript-ify"+os.linesep)
sys.stdout.flush()
def ll(m,colorS="default",noLineBreakF=False):
global p
sys.stdout.write(("" if colorS=="default" else p["cliColorO"][colorS])+str(m)+("" if colorS=="default" else p["cliColorO"]["end"])+("" if noLineBreakF else os.linesep))
sys.stdout.flush()
def warn(m):
ll(m,"y")
def fail(m):
ll(m,"r")
sys.exit()
# warning : not diamond-solid
def readFile(filenameS):
# os.path.isfile(filenameS)
try:
file = open(filenameS,"r")
except IOError as err:
return None
txt = file.read()
file.close()
return txt
# warning : not diamond-solid
def assertFile(filenameS):
if not os.path.isfile(filenameS):
file = open(filenameS,"w")
file.write("")
file.close()
def extractLinkData(linkS,method="GET",dataO=None,headerO=None,returnFalseOnFailureF=False):
# avoid shared mutable default arguments : a {} default persists across calls, and headerO gets mutated below
if dataO is None:dataO = {}
if headerO is None:headerO = {}
# 22 Mar 2020
# • pixiv is now requiring User-Agent in the header, otherwise will serve a 403(,1010[?]) error
# 17 Jan 2021
# • pixiv is now requiring that the User-Agent start with something reasonable, such as "Mozilla/5.0".
if "User-Agent" not in headerO:
headerO["User-Agent"] = "Mozilla/5.0"
req = urllib2.Request(linkS,urllib.urlencode(dataO),headerO)
req.get_method = lambda : method
try:response = urllib2.urlopen(req)
except urllib2.HTTPError as err:
if returnFalseOnFailureF:return False
else:fail(linkS+" : "+str(err))
res = {"txt":response.read(),"txtHeader":str(response.info()),}
response.close()
return res
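# minimal usage sketch [assumes network access ; link is illustrative] :
# L> reqE = extractLinkData("https://www.pixiv.net/","GET")
# L> reqE["txt"] -> response body
# L> reqE["txtHeader"] -> raw response headers [the Set-Cookie values are regexed out of this later]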
def p_extractTxt(linkS,returnFalseOnFailureF=False):
global PHPSessionID
cookie = p["cookieOverrideS"] if p["cookieOverrideS"] is not None else "PHPSESSID="+PHPSessionID
reqE = extractLinkData(linkS,"GET",{},{"Cookie":cookie,"Connection":"keep-alive",},True)
if reqE == False:
if returnFalseOnFailureF:
return False
else:
fail("ERROR : Failed to extract text from "+linkS+" (fetch error)")
return reqE["txt"]
def p_extractJson(linkS,returnFalseOnFailureF=False):
try:
return json.loads(p_extractTxt(linkS))
except:
if returnFalseOnFailureF:
return False
else:
fail("ERROR : Failed to extract JSON from "+linkS+" (decode error)")
def p_extractJsonO(linkS,returnFalseOnFailureF=False):
res = p_extractJson(linkS,returnFalseOnFailureF)
if type(res) is not dict:
if returnFalseOnFailureF:
return False
else:
fail("ERROR : Failed to extract JSON Object from "+linkS+" (root type not Object)")
return res
def p_regex(patternS,subjectS,returnFalseOnFailureF=False):
m = re.search(patternS,subjectS)
if m is None:
if returnFalseOnFailureF:
return False
else:
fail("regex parser failed to parse : "+subjectS+" with pattern : "+patternS)
l = []
l.append(m.group(0))
return l+list(m.groups())
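# e.g. p_regex('(\d+)_p(\d+)','12345_p0.jpg') -> ['12345_p0','12345','0']
# [group(0) is prepended, so result[n] lines up with m.group(n)]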
def p_any(m,fxn):
paramC = len(inspect.getargspec(fxn).args)
if type(m) is dict:
if paramC == 0:
for k,v in m.items():
if fxn():
return True
if paramC == 1:
for k,v in m.items():
if fxn(v):
return True
if paramC == 2:
for k,v in m.items():
if fxn(v,k):
return True
if paramC == 3:
for k,v in m.items():
if fxn(v,k,m):
return True
elif type(m) is list:
if paramC == 0:
for i,v in enumerate(m):
if fxn():
return True
if paramC == 1:
for i,v in enumerate(m):
if fxn(v):
return True
if paramC == 2:
for i,v in enumerate(m):
if fxn(v,i):
return True
if paramC == 3:
for i,v in enumerate(m):
if fxn(v,i,m):
return True
return False
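# e.g. p_any([1,2,3],lambda v:v>2) -> True
# p_any({"a":1},lambda v,k:k=="b") -> False
# [the callback's declared arity decides whether it receives value, value+key/index, or value+key/index+collection]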
# Source : [https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks]
def p_lChunk(l,n):
for i in xrange(0,len(l),n):
yield l[i:i+n]
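# e.g. list(p_lChunk([1,2,3,4,5],2)) -> [[1,2],[3,4],[5]]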
p = {
"threadC" : 16,
"threadMaxC" : 64,
"cookieOverrideS" : None,
"indentS" : " ",
"emailS" : "",
"passwordS" : "",
"userIDA" : [],
"jobEQueue_stage1" : Queue.Queue(),
"jobEQueue_stage2" : Queue.Queue(),
"pageTypeSA" : ["p","ugoira"], # the order here is by probability
"extensionSA" : [".jpg",".png",".gif"],} # the order here is by probability
# command-line interface colors, enabled for mac os x where I know it works, disabled everywhere else
# https://docs.python.org/2/library/sys.html#platform
# System | platform value
# --------------------+---------------
# Linux (2.x and 3.x) | 'linux2'
# Windows | 'win32'
# Windows/Cygwin | 'cygwin'
# Mac OS X | 'darwin'
# OS/2 | 'os2'
# OS/2 EMX | 'os2emx'
# RiscOS | 'riscos'
# AtheOS | 'atheos'
if sys.platform == "darwin":
p["cliColorO"] = {
"r" : "\033[91m",
"g" : "\033[92m",
"b" : "\033[94m",
"c" : "\033[96m",
"m" : "\033[95m",
"y" : "\033[93m",
"gray" : "\033[90m",
"end" : "\033[0m",
# plain color [colored BIU exists, look it up if you want it]
"bold" : "\033[1m",
"underline" : "\033[4m",}
else:
p["cliColorO"] = {
"r" : "",
"g" : "",
"b" : "",
"c" : "",
"m" : "",
"y" : "",
"gray" : "",
"end" : "",
# plain color [colored BIU exists, look it up if you want it]
"bold" : "",
"underline" : "",}
ll("---- START ----","c")
ll("To stop this program, use Control+Z for Apple Operating Systems.","m")
# handle command-line arguments
# ----------------------------------------------------------------------------------------------------------------------
try:optA,leftoverA = getopt.getopt(sys.argv[1:],'ht:c:T:',['help'])
except getopt.GetoptError as err:printUsage();fail("ERROR : "+str(err))
for opt,arg in optA:
if opt in ["-h","--help"]:printUsage();sys.exit()
if opt in ["-t"]:
try:p["threadC"] = int(arg)
except ValueError as err:fail("ERROR : [-t threadcount] argument not integer : "+arg)
if p["threadC"] < 1:fail("ERROR : [-t threadcount] argument too small (min:1) : "+arg)
if p["threadC"] > p["threadMaxC"]:fail("ERROR : [-t threadcount] argument too large (max:"+str(p["threadMaxC"])+") : "+arg)
if opt in ["-c"]:
p["cookieOverrideS"] = str(arg)
# handle userIDA.txt
# ----------------------------------------------------------------------------------------------------------------------
assertFile("userIDA.txt")
txt = readFile("userIDA.txt")
# remove comments
txt = re.sub(re.compile('//.*$',re.MULTILINE),'',txt)
# parse for ints
userIDSA = re.split('\\D+',txt)
for userIDS in userIDSA:
if userIDS != "": # because of how the regex split that I wrote works, blanks may show up at the front and back
p["userIDA"].append(int(userIDS))
p["jobEQueue_stage1"].put({"classnameS":"Stage1Job","argO":{"userID":int(userIDS)}},False)
if len(p["userIDA"]) == 0:fail("ERROR : Fill in your userIDA.txt file with pixiv userIDs (the number found in the URL bar for a profile page), one per line")
# handle login.txt
# ----------------------------------------------------------------------------------------------------------------------
assertFile("login.txt")
txt = readFile("login.txt")
# remove comments
txtA = txt.splitlines()
if len(txtA) < 2:fail("ERROR : Fill in your login.txt file with email on first line, password on second line")
p["emailS"] = txtA[0]
p["passwordS"] = txtA[1]
ll("userID List : "+str(p["userIDA"]))
# pixiv login to obtain PHPSESSID cookie
# ----------------------------------------------------------------------------------------------------------------------
PHPSessionID = None
if p["cookieOverrideS"] is None:
ll("Chosen authentication method : sign in.","m");
# open the login page
reqE = extractLinkData("https://accounts.pixiv.net/login","GET")
# get the form callback key, there are two places to find it, I chose the JSON location
# L> <input type="hidden" name="post_key" value="4d0dc83acbe2f27ba139be1559c4455d">
# L> "pixivAccount.postKey":"4d0dc83acbe2f27ba139be1559c4455d"
m = re.search('"pixivAccount\.postKey":"(.+?)"',reqE["txt"])
if not m:fail("ERROR : could not find login callback key [developer's fault - pixiv changed their login page format]")
postKey = m.group(1)
ll("postKey GET!! : "+postKey,"m")
# get the callback cookie from the header
m = re.search('PHPSESSID=(.+?);',reqE["txtHeader"])
if not m:fail("ERROR : could not find login callback cookie [developer's fault - pixiv changed their login page format]")
PHPSessionID = m.group(1)
ll("PHPSessionID GET!! : "+PHPSessionID,"m")
#....
# make the signin request
ll("Attempting to sign in as user:"+p["emailS"],"m")
reqE = extractLinkData("https://accounts.pixiv.net/api/login?lang=en","POST",{"pixiv_id":p["emailS"],"password":p["passwordS"],"captcha":"","g_recaptcha_response":"","post_key":postKey,"source":"accounts",},{"Connection":"keep-alive","Cookie":"PHPSESSID="+PHPSessionID,})
ll("pixiv says : "+reqE["txt"],"m")
# get the newest cookie before we proceed
m = re.search('PHPSESSID=(.+?);',reqE["txtHeader"])
if not m:
msgS = "ERROR : could not find login response cookie"
msgS += os.linesep+p["indentS"]+"Perhaps supplied sign-in information is invalid (please check your authentication text file)."
msgS += os.linesep+p["indentS"]+"Perhaps your account is being captcha-gated (please try signing in with a web browser to look for anomalies)."
msgS += os.linesep+p["indentS"]+"Perhaps your account has been [temporarily] locked (please try signing in with a web browser to look for anomalies)."
msgS += os.linesep+p["indentS"]+"Perhaps pixiv changed their login page format, thwarting this script until the developer updates it (contact the developer)."
msgS += os.linesep+p["indentS"]+"You could always attempt to create and use a new account, updating your authentication text file accordingly."
msgS += os.linesep+p["indentS"]+"You could always attempt to use the cookie override command-line variable, the downside being it involves manual labor. Use --help to see information about this option."
fail(msgS)
PHPSessionID = m.group(1)
ll("PHPSessionID GET!! : "+PHPSessionID,"m")
else:
ll("Chosen authentication method : cookie override.","m");
# scan each artist for image download links
# ----------------------------------------------------------------------------------------------------------------------
class Stage1Job():
def __init__(self,userID=0):
self.userID = userID
self.foldername = None
def run(self):
global p
userIDS = str(self.userID)
imageEA = []
try: # try-except-finally misused here so that "return" takes it to the finally block, where we can wrap up
# GET USERNAME
#-------------
txtS = p_extractTxt("https://www.pixiv.net/users/"+userIDS,True)
if txtS is False:
msgS = "WARNING : User:"+str(userIDS)+"'s page is not viewable."
msgS += os.linesep+p["indentS"]+"This could be an authentication issue causing a redirect loop (if it happens for all artists)."
msgS += os.linesep+p["indentS"]+"This could be a closed account (if it only happens for a few artists)."
warn(msgS)
return
# Ensure we're looking at a valid, signed-in page. If not, stop the entire program.
# !!! Of squishy use in this particular spot. Preferably in a more global-ish one-time place.
if p_regex('class\s*?=\s*?["\']([^"\']+?\s)*?welcome["\'\s]',txtS,True):
fail("ERROR : The signin failed [developer's fault - pixiv changed their artist page format].")
if p_regex('class\s*?=\s*?["\']([^"\']+?\s)*?error\-title["\'\s]',txtS,True): # Don't rely on Japanese/English message.
warn("WARNING : User:"+str(userIDS)+"'s page is not viewable [probably a closed account].")
return
# [21 Sep 2019] change from '<title>「(.+?)」'
usernameS = p_regex('<title>(.+?) - pixiv</title>',txtS)[1]
self.foldername = esc(usernameS+"#"+userIDS)
assertFolder(foldername=self.foldername,wildname="*"+esc("#"+userIDS))
# read from bottom to top
#----
# 28 Sep 2018
# [officially reported [17 Sep 2018] by AshtonHarding on GitHub]
# • pixiv has a new look for works - unfortunately only rolled out to some users randomly
# • they really went all-in with their async API -ness ; whole Stage1Job class needed a makeover
# • thankfully, they're using their own API, and now it's all API-based
# • thankfully, we now have a way to almost instantly glean all work information (except for file extensions)
# • thankfully, this new version of pixivRoot works for both old and new UI types
#----
# 29 Dec 2016
# • pixiv has placeholder links before it asynchronously loads images
#m = re.findall('data-src="(https?:\/\/[^<>\s]+?\/)[^<>\s]+?(\d{4}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/)(\d+)_p(\d+)[^<>\s]+?(\.[^\."]+)"',reqE["txt"])
#....
# 19 Dec 2016
# • pixiv changed their HTML by moving the class:thumbnail portion
# • the regex dot finder was encountering catastrophic backtracking, so changed them all to [^<>] to stay within the confines of the local HTML tag
#m = re.findall('<img src="(https?:\/\/[^<>]+?\/)[^<>]+?(\d{4}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/)(\d+)_p(\d+)[^<>]+?(\.[^\."]+)"',reqE["txt"])
#....
# original [unknown date]
#m = re.findall('<img src="(https?:\/\/.+?\/).+?(\d{4}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/)(\d+)_p(\d+).+?(\.[^\.]+)" class="_thumbnail">',reqE["txt"])
#----
# GET ALL ILLUST ID
#------------------
datO = p_extractJsonO("https://www.pixiv.net/ajax/user/"+userIDS+"/profile/all")
# The label "manga" seems to be a misnomer. These works do appear in the old theme's illust pages.
illustIDNA = map(int,list(set((datO["body"]["illusts"].keys() if type(datO["body"]["illusts"]) is dict else []) + (datO["body"]["manga"].keys() if type(datO["body"]["manga"]) is dict else []))))
illustIDNA.sort()
ll( userIDS .rjust( 9," ")+" user"
+ " | "+str(len(illustIDNA)).rjust(27," ")+" gallery count"
+ " | "+usernameS +" name"
,"c")
if len(illustIDNA) == 0:
return
#ll(illustIDNA)
# COMPILE ROLODEX0 (GALLERY LIST)
#--------------------------------
# [!] This API page is limited to 100 entries at a time.
rolodex0 = []
for illustIDNChunkA in p_lChunk(illustIDNA,100):
datO = p_extractJsonO("https://www.pixiv.net/ajax/user/"+userIDS+"/profile/illusts?"+("&".join(map(lambda illustIDN:"ids%5B%5D="+str(illustIDN),illustIDNChunkA)))+"&is_manga_top=0&work_category=&is_first_page=0") # [21 Sep 2019] forced to add &work_category=&is_first_page=0, interestingly work_category doesn't need a value
rolodex0 += map(lambda kv:{ # kv : an (illustID,workO) pair from iteritems()
"illustS" : str(kv[1]["id" ]), # [!] Crucial str() calls to convert unicode type to str type.
"pageC" : kv[1]["pageCount"] ,
"urlNotS" : str(kv[1]["url" ]),
"userS" : str(kv[1]["userId" ]),},datO["body"]["works"].iteritems())
rolodex0 = sorted(rolodex0,key=lambda k:int(k["illustS"]))
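# each rolodex0 row ends up shaped roughly like this [values illustrative, not from a real response] :
# L> {"illustS":"76543210","pageC":3,"urlNotS":"https://i.pximg.net/c/250x250_80_a2/img-master/img/2019/09/21/00/00/00/76543210_p0_square1200.jpg","userS":"11"}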
#ll("rolodex0.keys()"+str(map(lambda o:o["illustS"],rolodex0)))
# COMPILE ROLODEX1 (WORK LIST)
#-----------------------------
rolodex1 = []
for row in rolodex0:
for pageN in range(0,row["pageC"]):
rolodex1.append({
"illustS" : row["illustS"],
"pageS" : str(pageN) ,
"urlNotS" : row["urlNotS"],
"userS" : row["userS" ],})
ll( userIDS .rjust( 9," ")+" user"
+ " | "+str(len(rolodex1)).rjust(30," ")+" work count"
+ " | "+usernameS +" name"
,"b")
#ll("rolodex1.keys()"+str(map(lambda o:o["illustS"],rolodex1)))
# COMPILE DOWNLOAD JOBS
#----------------------
extensionSA_ALTER = list(p["extensionSA"])
pageTypeSA_ALTER = list(p["pageTypeSA" ])
for row1 in rolodex1:
row2 = {
"illustS" : row1["illustS"],
"pageS" : row1["pageS" ],
"refererS" : "https://www.pixiv.net/",
"userS" : row1["userS" ],}
localF = False
for extensionS in p["extensionSA"]:
pathS = self.foldername+"/"+esc(row2["illustS"]+"_p"+str(row2["pageS"])+extensionS)
if os.path.isfile(pathS):
localF = True
break
if localF:
# [21 Oct 2018] Seeing this text was getting annoying since it doesn't matter much.
#ll( row2["userS" ].rjust(9," ")+" user"
#+ " | "+row2["illustS"].rjust(9," ")+" illust"
#+ " | "+row2["pageS" ].rjust(3," ")+" page"
#+ " | "+("✕ DL" if localF else "◯ DL")
#+ " | "+usernameS +" name"
#+ " | "+pathS
#,"default")
continue
row2["dateS" ] = p_regex('\d{4}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/',row1["urlNotS"])[0]
row2["domainS"] = p_regex('https?:\/\/[^<>\s]+?\/' ,row1["urlNotS"])[0]
# Guess at what the URL is from a small pool of possibilities.
# Would obviously love to just know this ahead of time, but can't figure out the internal-API call for that.
breakN = 0
remoteF = False
for pageTypeS in pageTypeSA_ALTER:
breakN = 0
for extensionS in extensionSA_ALTER:
breakN = 0
urlS = row2["domainS"]+"img-original/img/"+row2["dateS"]+row2["illustS"]+"_"+pageTypeS+str(row2["pageS"])+extensionS
pathS = self.foldername +"/"+esc(row2["illustS"]+"_"+pageTypeS+str(row2["pageS"])+extensionS)
reqE = extractLinkData(urlS,"HEAD",{},{"Referer":row2["refererS"],"Connection":"keep-alive",},returnFalseOnFailureF=True)
if reqE != False:
if pageTypeS == "ugoira":
ll( row2["userS" ].rjust(9," ")+" user"
+ " | "+row2["illustS"].rjust(9," ")+" illust"
+ " | "+row2["pageS" ].rjust(3," ")+" page"
+ " | "+"✕ DL"
+ " | "+usernameS +" name"
+ " | "+pathS
+ " | "+"[!] ugoira not yet supported."
,"y")
breakN = 3
break
remoteF = True
breakN = 2
break
if breakN >= 2:
break
if breakN >= 3:
continue # breakN of 3 means ugoira was found : skip this work entirely
if not remoteF:
fail("Could not find extension of illust:"+row2["illustS"])
# Dynamically reorder the URL-guess arrays to reflect the most recently found entry.
# Certain artists seem to favor certain formats, so this should improve performance over a hard-coded priority order.
# Remove the element and insert it at the front, for the next go-around to read.
pageTypeSA_ALTER.remove(pageTypeS) # Element ought to be in collection.
pageTypeSA_ALTER.insert(0,pageTypeS)
extensionSA_ALTER.remove(extensionS) # Element ought to be in collection.
extensionSA_ALTER.insert(0,extensionS)
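# e.g. with the default order [".jpg",".png",".gif"], finding a ".png" makes the next guess order [".png",".jpg",".gif"]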
ll( row2["userS" ].rjust(9," ")+" user"
+ " | "+row2["illustS"].rjust(9," ")+" illust"
+ " | "+row2["pageS" ].rjust(3," ")+" page"
+ " | "+("✕ DL" if localF else "◯ DL")
+ " | "+usernameS +" name"
+ " | "+pathS
+ " | "+urlS
,"g")
imageEA.append({"url":urlS,"referer":row2["refererS"],"pathLocal":pathS})
finally:
if len(imageEA) >= 1:
imageEQueue = Queue.Queue()
for imageE in reversed(imageEA):
imageEQueue.put(imageE)
p["jobEQueue_stage2"].put({"classnameS":"Stage2Job","argO":{"imageEQueue":imageEQueue}})
class Stage2Job():
def __init__(self,imageEQueue):
self.imageEQueue = imageEQueue
def run(self):
while True:
try:
imageE = self.imageEQueue.get(False)
except Queue.Empty:
return
#ll("△ "+imageE["pathLocal"],"gray")
reqE = extractLinkData(imageE["url"],"GET",{},{"Referer":imageE["referer"],"Connection":"keep-alive",})
image_file = open(imageE["pathLocal"],"wb") # binary mode : these are image bytes, not text
image_file.write(reqE["txt"])
image_file.close()
ll("◯ "+imageE["pathLocal"],"g")
class Proc(threading.Thread):
def __init__(self,getFxn):
super(Proc,self).__init__()
self.getFxn = getFxn
def run(self):
global p
while True:
jobE = self.getFxn()
if jobE is None:
return
job = globals()[jobE["classnameS"]](**jobE["argO"])
job.run()
# scan for each image
# ----------------------------------------------------------------------------------------------------------------------
# multithreaded execute
tA = []
def stage1Fxn():
global p
try:
return p["jobEQueue_stage1"].get(False)
except Queue.Empty:
return None
for i in xrange(p["threadC"]):
t = Proc(getFxn=stage1Fxn)
t.daemon = True
tA.append(t)
t.start()
for t in tA:
t.join()
tA = []
# read through the queue (without, in the end, modifying it) to count the number of images to download
tempQueue = Queue.Queue()
imageN = 0
while True:
try:
jobE = p["jobEQueue_stage2"].get(False)
except Queue.Empty:
break
imageN += jobE["argO"]["imageEQueue"].qsize()
tempQueue.put(jobE)
while True:
try:
jobE = tempQueue.get(False)
except Queue.Empty:
break
p["jobEQueue_stage2"].put(jobE)
# download each image
# ----------------------------------------------------------------------------------------------------------------------
# multithreaded execute
# go in reverse, for historical reasons:
# if the script gets interrupted, then when it's next executed, it won't [as often] improperly trigger the stopOnFoundF flag
# this is no longer necessary, but it seems like good form
ll("Downloading "+str(imageN)+" images...","m")
tA = []
def stage2Fxn():
global p
try:
return p["jobEQueue_stage2"].get(False)
except Queue.Empty:
return None
for i in xrange(p["threadC"]):
t = Proc(getFxn=stage2Fxn)
t.daemon = True
tA.append(t)
t.start()
for t in tA:
t.join()
tA = []
ll(os.linesep+"END","c")