-
Notifications
You must be signed in to change notification settings - Fork 0
/
cvinfodraft.py
141 lines (123 loc) · 6.33 KB
/
cvinfodraft.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import requests
from bs4 import BeautifulSoup
from lxml import html
import re
# Names of directories that should be left out of processing
ignore_list = [
    'ignore_dir1',
    'ignore_dir2',
]
def search_comicvine(series_name, timeout=30):
    """Search ComicVine's HTML search page for a link matching *series_name*.

    The lookup runs in two stages:

    1. "Volume Search": result pages 1 and 2 are scanned for an entry whose
       second ``<p>`` text contains ``Volume <n>``, where ``<n>`` is taken
       from a ``V<digits>`` marker in the series name (``1`` when the name
       has no such marker).
    2. "Year Search": if stage 1 finds nothing and the name contains a
       four-digit year in parentheses, the unpaged search results are
       scanned for an entry whose first ``<p>``/``<span>`` text starts with
       ``Volume <year>``.

    Args:
        series_name: Series name to search for (the caller passes a
            directory name).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(href, search_type)`` tuple, where ``href`` is the ``href``
        attribute of the matching result link exactly as found in the page
        and ``search_type`` is ``"Volume Search"`` or ``"Year Search"``.
        ``(None, None)`` when nothing matched or every request failed.
    """
    # Imported locally so this function brings its own stdlib dependency.
    from urllib.parse import quote

    print(f"Searching ComicVine for series: {series_name}")

    # Extract volume number if present, otherwise consider it as Volume 1
    match = re.search(r'V(\d+)', series_name)
    volume_number = match.group(1) if match else '1'
    print(f"Volume number: {volume_number}")

    # Percent-encode the whole name: replacing only spaces would let
    # characters such as '&', '#', '+' or '?' corrupt the query string.
    search_query = quote(series_name, safe='')
    search_base = "https://comicvine.gamespot.com/search/"

    # One absolute XPath per result slot; only the first 15 list items of
    # the first <ul> are inspected.
    result_paths = [
        f'/html/body/div[1]/div/div[2]/div/div/ul[1]/li[{i}]/a' for i in range(1, 16)
    ]

    # The negative lookahead stops "Volume 1" from matching "Volume 10" or
    # "Volume 1999", which a plain substring test would accept.
    volume_pattern = re.compile(rf"Volume {volume_number}(?!\d)")

    def load_tree(search_url):
        """Download *search_url* and return its lxml tree, or None on failure."""
        try:
            response = requests.get(search_url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # A failed request is reported and treated as "no results" so a
            # single network error does not abort the whole run.
            print(f"Request failed for {search_url}: {exc}")
            return None
        markup = str(BeautifulSoup(response.text, 'html.parser'))
        if not markup.strip():
            # lxml raises on an empty document, so bail out early.
            return None
        return html.fromstring(markup)

    def find_match(tree, path_label, info_xpath, is_match):
        """Return the href of the first result link whose info text matches."""
        for path in result_paths:
            print(f"{path_label}: {path}")
            for anchor in tree.xpath(path):
                href = anchor.get('href')
                if not href:
                    # A link without a target cannot be written to cvinfo.
                    continue
                volume_info = [text.strip() for text in anchor.xpath(info_xpath)]
                print(f"Found volume info: {volume_info}")
                if volume_info and is_match(volume_info[0]):
                    print(f"Found URL: {href} for series: {series_name}")
                    return href
        return None

    # Stage 1: look for "Volume <n>" on the first two result pages.
    for page in (1, 2):
        search_url = f"{search_base}?q={search_query}&page={page}"
        print(f"Search URL: {search_url}")
        tree = load_tree(search_url)
        if tree is None:
            continue
        url = find_match(tree, "Checking path", './/p[2]/text()', volume_pattern.search)
        if url:
            return url, "Volume Search"

    # Stage 2: if no match found, search for "Volume YEAR"
    year_match = re.search(r'\((\d{4})\)', series_name)
    if year_match:
        year = year_match.group(1)
        print(f"Year found: {year}")
        tree = load_tree(f"{search_base}?q={search_query}")
        if tree is not None:
            year_pattern = re.compile(rf"Volume\s+{year}")
            url = find_match(
                tree, "Checking path for year", './/p[1]/span/text()', year_pattern.match
            )
            if url:
                return url, "Year Search"

    print(f"No URL found for series: {series_name}")
    return None, None
def create_cvinfo_file(directory, no_url_dirs, found_urls):
    """Write a ``cvinfo`` file into *directory* when ComicVine has a match.

    The directory's base name is used as the series name for the search.

    Args:
        directory: Path of the series directory to process.
        no_url_dirs: List that receives the series name when no URL is found.
        found_urls: List that receives a ``(series_name, full_url,
            search_type)`` tuple when a URL is found.
    """
    series_name = os.path.basename(directory)
    print(f"Processing directory: {directory}")

    url, search_type = search_comicvine(series_name)

    # Nothing matched: record the miss and stop here.
    if not url:
        print(f"No URL found for series: {series_name}")
        no_url_dirs.append(series_name)
        return

    # The search yields a site-relative link, so prefix the site origin.
    full_url = f"https://comicvine.gamespot.com{url}"
    target_path = os.path.join(directory, 'cvinfo')
    with open(target_path, 'w') as handle:
        handle.write(full_url)
    print(f"cvinfo file created with URL: {full_url}")

    found_urls.append((series_name, full_url, search_type))
def process_immediate_subdirectories(root_directory):
    """Create cvinfo files for each direct subdirectory and write two reports.

    Every immediate subdirectory of *root_directory* whose name is not in
    ``ignore_list`` is passed to ``create_cvinfo_file``. Afterwards two
    files are written into *root_directory*:

    * ``no_url_dirs.txt`` - one series name per line for which no URL was
      found.
    * ``found_urls.html`` - a two-column table listing the URLs found by the
      "Volume Search" and by the "Year Search".

    Args:
        root_directory: Directory whose immediate subdirectories are
            processed and in which the report files are written.
    """
    # Only the function is imported, and only locally: the module-level name
    # `html` is bound to lxml's html module and must not be shadowed.
    from html import escape
    from itertools import zip_longest

    print(f"Processing root directory: {root_directory}")
    no_url_dirs = []
    found_urls = []

    # The first tuple yielded by os.walk describes root_directory itself;
    # its second element lists the immediate subdirectory names.
    for subdir in next(os.walk(root_directory))[1]:
        # Skip directories in the ignore list
        if subdir in ignore_list:
            print(f"Skipping directory: {subdir}")
            continue
        create_cvinfo_file(os.path.join(root_directory, subdir), no_url_dirs, found_urls)

    # Write the list of directories with no URL found to a .txt file
    no_url_file_path = os.path.join(root_directory, 'no_url_dirs.txt')
    with open(no_url_file_path, 'w') as file:
        file.writelines(f"{dir_name}\n" for dir_name in no_url_dirs)
    print(f"List of directories with no URL found written to: {no_url_file_path}")

    def render_item(dir_name, url):
        """Return one report entry with its text and attribute HTML-escaped."""
        # Directory names routinely contain '&' and may contain '<' or '"';
        # unescaped they would produce malformed markup.
        safe_url = escape(url, quote=True)
        return f"<li>{escape(dir_name)}: <a href=\"{safe_url}\">{safe_url}</a></li>"

    volume_search_items = [
        render_item(dir_name, url)
        for dir_name, url, search_type in found_urls
        if search_type == "Volume Search"
    ]
    year_search_items = [
        render_item(dir_name, url)
        for dir_name, url, search_type in found_urls
        if search_type == "Year Search"
    ]

    # Write the list of successfully found URLs to a .html file with two columns
    found_urls_file_path = os.path.join(root_directory, 'found_urls.html')
    with open(found_urls_file_path, 'w') as file:
        file.write("<html><body><table border='1'>\n")
        file.write("<tr><th>Volume Search</th><th>Year Search</th></tr>\n")
        # Pair the columns row by row; the shorter column is padded with
        # empty cells.
        for volume_item, year_item in zip_longest(
            volume_search_items, year_search_items, fillvalue=''
        ):
            file.write(f"<tr><td>{volume_item}</td><td>{year_item}</td></tr>\n")
        file.write("</table></body></html>\n")
    print(f"List of successfully found URLs written to: {found_urls_file_path}")
# Locate the folder that contains this script file
# NOTE(review): these statements run whenever the module is loaded, not only
# when it is executed as a script - confirm whether a __main__ guard is wanted.
script_directory = os.path.split(os.path.abspath(__file__))[0]
print(f"Script directory: {script_directory}")
# Handle just the first level of subdirectories beneath that folder
process_immediate_subdirectories(script_directory)