"""crawler.py: a simple same-domain web crawler.

Starting from a base URL, it fetches pages, resolves the domain of every
linked resource, and writes a plain-text report of response headers,
resolved IP addresses, and outbound connections.
"""

import socket
import time
import urllib.parse
from collections import deque

import requests
from bs4 import BeautifulSoup


def resolve_domain(domain):
    try:
        return socket.gethostbyname(domain)
    except socket.gaierror:
        return "Unable to resolve"


def find_connections(url, base_url):
    """Fetch a page, collect every linked resource, and return the set of
    connections plus a dict of page info (headers, resolved IP, connections)."""
    connections = set()
    connection_info = []
    info = {"URL": url}  # fallback so the return below works if the request fails
    try:
        response = requests.get(url, allow_redirects=True)
        info = {
            "URL": url,
            "HTTP Response Headers": dict(response.headers),
            "Resolved IP Address": resolve_domain(urllib.parse.urlparse(response.url).netloc),
            "Connections": connections,  # same set object, populated below
        }
        soup = BeautifulSoup(response.content, 'html.parser')

        # Collect link and resource references from HTML tags
        for tag in soup.find_all(['a', 'img', 'script', 'link']):
            for attr in ('href', 'src'):
                link = tag.get(attr)
                if not link:
                    continue
                connections.add(link)
                parsed_url = urllib.parse.urlparse(link)
                domain = parsed_url.netloc
                path = parsed_url.path
                resolved_ip = resolve_domain(domain) if domain else "N/A (relative URL)"
                connection_info.append({"Domain": domain, "Path": path, "Resolved IP": resolved_ip})

        print("HTTP Response Headers:")
        for header, value in response.headers.items():
            print(f"{header}: {value}")
        print("IP Address:", info["Resolved IP Address"])
        print("\n-----")
        print("Connections:")
        time.sleep(0.6)
        for conn in connection_info:  # renamed from `info` to avoid shadowing the page info dict
            print("Domain:", conn["Domain"])
            print("Path:", conn["Path"])
            print("Resolved IP:", conn["Resolved IP"])
            print()
    except requests.exceptions.RequestException as e:
        print("Error:", e)
    return connections, info


def crawl_site(base_url):
    """Breadth-first crawl of pages on the same domain as base_url."""
    visited = set()
    queue = deque([base_url])
    base_domain = urllib.parse.urlparse(base_url).netloc  # domain of the base URL
    report = []
    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url)
            status_code = response.status_code
            info = {
                "URL": url,
                "Status Code": status_code
            }
            print("=INFO=======================================")
            print(f"URL: {url}, Status Code: {status_code}")
            if status_code == 200:
                connections, conn_info = find_connections(url, base_url)
                info.update(conn_info)
                report.append(info)
                soup = BeautifulSoup(response.content, 'html.parser')
                links = soup.find_all('a', href=True)
                for link in links:
                    next_url = urllib.parse.urljoin(url, link['href'])
                    parsed_next_url = urllib.parse.urlparse(next_url)
                    # Only follow links whose domain matches the base URL's domain
                    if parsed_next_url.netloc == base_domain:
                        queue.append(next_url)
            else:
                report.append(info)
        except requests.RequestException as e:
            print(f"Error accessing {url}: {e}")
        print("-" * 50)
    return report


def make_report(report, filename):
    with open(filename, "w") as file:
        for info in report:
            file.write("URL: {}\n".format(info["URL"]))
            file.write("Status Code: {}\n".format(info.get("Status Code", "N/A")))
            if "HTTP Response Headers" in info:
                file.write("HTTP Response Headers:\n")
                for header, value in info["HTTP Response Headers"].items():
                    file.write(f"{header}: {value}\n")
            file.write("Resolved IP Address: {}\n".format(info.get("Resolved IP Address", "N/A")))
            if "Connections" in info:
                file.write("Connections:\n")
                for connection in info["Connections"]:
                    file.write(f"  {connection}\n")
            file.write("\n")


if __name__ == "__main__":
    try:
        url = input("Enter the URL to check: ")
        report_filename = input("Enter the filename to save the report: ")
        report = crawl_site(url)
        make_report(report, report_filename)
    except KeyboardInterrupt:
        print("\nCrawl interrupted by user.")
    except Exception as e:
        print(f"Unexpected error: {e}")