forked from chanwooh/Nextdoor-Script
-
Notifications
You must be signed in to change notification settings - Fork 0
/
html_scraper.py
126 lines (102 loc) · 3.45 KB
/
html_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Python Version 3.8.0
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import sys
import time
import csv
from lxml import html
import requests
import json
from dotenv import load_dotenv
import os
# Load environment variables from .env (supplies the "selenium_html" path).
load_dotenv()


def _first(values, default):
    """Return the first item of *values* (an xpath result list), or *default* when empty."""
    return values[0] if values else default


# Parse the locally saved Nextdoor HTML page; the context manager closes the file.
with open(os.getenv("selenium_html"), "r", encoding="utf-8") as f:
    tree = html.fromstring(f.read())

post_nodes = tree.xpath('//div[contains(@class, "css-aqcial")]')

# XPath expressions, evaluated relative to each individual post node.
author_path = './/a[@class="_19bqJaQo dBEpfhFh"]/text()'
location_path = './/span/*[contains(@class, "post-byline-cursor")]/text()'
title_path = './/*[@class="content-title-container"]/a/text()'
category_path = './/div[@class="content-scope-line"]/span/a/text()'
date_path = './/a[@class="post-byline-redesign"]/text()'
post_content_path = './/p[contains(@class, "content-body")]//span[@class="Linkify"]/span/text()'
num_replies_path = './/span[contains(@class, "post-comment-count-text")]/text()'
reply_author_path = './/a[@class="comment-detail-author-name"]/text()'
reply_content_path = './/span[@class="Linkify"]/span/text()'
# Fallback used when a post has no category link, only a neighborhood-scope line.
fallback_category_path = './/span[@class="content-scope-line-hood-link js-scope-line-hoods"]/text()'

# Collect the extracted fields (plus the node itself, for the category fallback)
# for every post that actually has an author.
posts = [(node.xpath(author_path),
          node.xpath(location_path),
          node.xpath(title_path),
          node.xpath(category_path),
          node.xpath(date_path),
          node.xpath(post_content_path),
          node.xpath(num_replies_path),
          node.xpath(reply_author_path),
          node.xpath(reply_content_path),
          node) for node in post_nodes if node.xpath(author_path)]

# newline="" is required by the csv module (otherwise blank rows appear on
# Windows); the context managers guarantee both files are closed and flushed
# (the original leaked both handles).
with open('posts.csv', "w", encoding='utf-8', newline='') as ofile, \
     open('replies.csv', "w", encoding='utf-8', newline='') as rfile:
    writer = csv.writer(ofile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(["Post Number", "Author", "Location", "Title", "Category", "Date", "Content", "Number of Replies"])
    reply_writer = csv.writer(rfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    reply_writer.writerow(["From Post", "Author", "Reply Content"])

    for post_counter, post in enumerate(posts, start=1):
        # The filter above guarantees post[0] is non-empty.
        author = post[0][0]
        location = _first(post[1], "Unlisted")
        title = _first(post[2], "No Title")
        date = _first(post[4], "No Date")
        content = _first(post[5], "No Content")

        if post[3]:
            category = post[3][0]
        else:
            # No category link: fall back to the neighborhood-scope text.
            category = _first(post[9].xpath(fallback_category_path), "No Category")

        num_replies = _first(post[6], "0 Comments")
        # Nextdoor shows the bare label "Comment" when a post has no comments yet.
        if num_replies == "Comment":
            num_replies = "0 Comments"

        writer.writerow([post_counter, author, location, title, category, date, content, num_replies])

        # One row per reply author (post[7]); when the matching content entry is
        # missing, write an empty string — the original silently reused the
        # previous reply's text here (or crashed with NameError on the first).
        for i, reply_author in enumerate(post[7]):
            reply_text = post[8][i] if i < len(post[8]) else ""
            reply_writer.writerow([post_counter, reply_author, reply_text])