
Commit

black
Smattr committed Feb 2, 2024
1 parent a1af7f4 commit 18a40a0
Showing 11 changed files with 314 additions and 222 deletions.
17 changes: 11 additions & 6 deletions feeders/base.py
@@ -15,15 +15,19 @@ def add(self, name, item):
def __iter__(self):
raise NotImplementedError


class Entry(object):
def __init__(self, name=None, subject=None, content=None, date=None, html=False, files=None):
self.name = name or ''
self.subject = subject or ''
self.content = content or ''
def __init__(
self, name=None, subject=None, content=None, date=None, html=False, files=None
):
self.name = name or ""
self.subject = subject or ""
self.content = content or ""
self.date = date
self.html = html
self.files = files or []


def download(url):
RETRIES = 3
for i in range(RETRIES):
@@ -33,14 +37,15 @@ def download(url):
except urllib.error.URLError as e:
if i == RETRIES - 1:
raise
if getattr(e, 'code', None) == 403:
if getattr(e, "code", None) == 403:
# Some sites explicitly block urllib to prevent crawling (e.g.
# Microsoft). Since we're not really a crawler, sidestep this by
# twiddling our user agent.
request = urllib.request.Request(url, headers={'User-Agent':''})
request = urllib.request.Request(url, headers={"User-Agent": ""})
response = urllib.request.urlopen(request)
return response.read()


# Sentinel class used by feeders to ask the main logic to write back state to
# disk. Feeders should use this following processing of each feed. The purpose
# of this is to minimise the resending of entries when a feeder is interrupted
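
Taken together, the pieces in this file suggest the overall shape of a feeder. The following is an illustrative sketch only, not code from the repository: it assumes the `self.feeds` and `self.resource` attributes and the `Entry`/`SyncRequest` usage seen in the other feeders of this commit.

import base  # assumes the feeders/ directory is on the import path, as the real feeders do


class Feeder(base.Feeder):
    def __iter__(self):
        for name, feed in self.feeds.items():
            url = feed["url"]  # "url" is the config key the other feeders use
            try:
                # download() handles retries and the 403 user-agent fallback internally
                text = base.download(url).decode("utf-8", "replace")
            except Exception as e:
                yield Exception(f"Error while loading {url}: {e}")
                continue
            if self.resource.get(url) != text:
                yield base.Entry(name, f"{url} fetched", text)
                self.resource[url] = text
            # ask the main loop to persist state now, so an interruption does
            # not cause already-sent entries to be resent
            yield base.SyncRequest()
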
27 changes: 16 additions & 11 deletions feeders/diff.py
@@ -8,25 +8,30 @@
class Feeder(base.Feeder):
def __iter__(self):
for n, i in self.feeds.items():
assert 'url' in i
url = i['url']
assert "url" in i
url = i["url"]
if url in self.resource:
old = self.resource[url].splitlines()
oldurl = url
else:
old = []
oldurl = '/dev/null'
oldurl = "/dev/null"
try:
new = bs4.BeautifulSoup(base.download(url).strip(), 'html.parser').get_text().splitlines()
new = (
bs4.BeautifulSoup(base.download(url).strip(), "html.parser")
.get_text()
.splitlines()
)
except Exception as e:
yield Exception(f'Error while loading {url}: {e}')
yield Exception(f"Error while loading {url}: {e}")
continue
lines = list(difflib.unified_diff(old, new, fromfile=oldurl,
tofile=url, lineterm=''))
if i.get('ignore_white_space', 'yes').lower() == 'yes':
lines = list(
difflib.unified_diff(old, new, fromfile=oldurl, tofile=url, lineterm="")
)
if i.get("ignore_white_space", "yes").lower() == "yes":
lines = list(diffcommon.suppress_whitespace(lines))
if len(lines) > 2:
content = '\n'.join(lines)
yield base.Entry(n, f'{url} changes', content)
self.resource[url] = '\n'.join(new)
content = "\n".join(lines)
yield base.Entry(n, f"{url} changes", content)
self.resource[url] = "\n".join(new)
yield base.SyncRequest()
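
A side note on the `len(lines) > 2` guard in the hunk above: `difflib.unified_diff` yields nothing at all for identical inputs, and otherwise yields the two `---`/`+++` file-header lines followed by one or more hunks, so once whitespace-only hunks have been stripped, anything beyond two lines means there is a real change to report. A small standalone illustration (the file labels are placeholders):

import difflib

old = ["first line", "second line"]

# Identical inputs: unified_diff yields nothing.
assert list(difflib.unified_diff(old, old, lineterm="")) == []

# A real change: two file-header lines plus at least one hunk.
new = ["first line", "second line, edited"]
lines = list(
    difflib.unified_diff(
        old, new, fromfile="/dev/null", tofile="http://example.com", lineterm=""
    )
)
assert lines[0].startswith("---") and lines[1].startswith("+++")
assert len(lines) > 2
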
13 changes: 7 additions & 6 deletions feeders/diffcommon.py
@@ -1,7 +1,7 @@
def suppress_whitespace(lines):
'''
"""
Remove hunks from a unified diff that only contain white space changes.
'''
"""

# States for following state machine.
IDLE, IN_HUNK = list(range(2))
@@ -15,7 +15,7 @@ def suppress_whitespace(lines):
if state == IDLE:
assert len(accumulated) == 0

if line.startswith('@@'):
if line.startswith("@@"):
# Encountered a new hunk.
accumulated = [line]
state = IN_HUNK
@@ -29,15 +29,16 @@ def suppress_whitespace(lines):
assert state == IN_HUNK
assert len(accumulated) > 0

if (line.startswith('+') or line.startswith('-')) and \
line[1:].strip() != '':
if (line.startswith("+") or line.startswith("-")) and line[
1:
].strip() != "":
# This is a non-empty change line. Decide to keep this hunk.
for a in accumulated:
yield a
accumulated = []
state = IDLE

elif line.startswith('@@'):
elif line.startswith("@@"):
# Encountered a new hunk without finding anything interesting in
# the current hunk. Ditch the current hunk (the prior contents
# of `accumulated`).
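
To make the intended behaviour of `suppress_whitespace` concrete, here is a hedged usage sketch. The input and assertions are illustrative: per the docstring, a hunk whose only added or removed lines are blank once stripped is dropped, while a hunk with a substantive change survives. The module is assumed to be importable as `diffcommon`, as the feeders above do.

import diffcommon  # assumes the feeders/ directory is on the import path

diff = [
    "--- old",
    "+++ new",
    "@@ -1,2 +1,3 @@",
    " unchanged line",
    "+   ",  # the only change in this hunk is a blank/whitespace line
    " unchanged line",
    "@@ -10 +11 @@",
    "-old wording",
    "+new wording",  # a real change, so this hunk should be kept
]

kept = list(diffcommon.suppress_whitespace(diff))
assert "@@ -10 +11 @@" in kept  # the hunk with a real change survives
assert "@@ -1,2 +1,3 @@" not in kept  # the whitespace-only hunk is dropped
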
61 changes: 35 additions & 26 deletions feeders/git.py
@@ -9,22 +9,21 @@


def run(cmd, cwd):
p = subprocess.Popen(cmd, cwd=cwd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
p = subprocess.Popen(cmd, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
stdout = stdout.decode('utf-8', 'replace')
stderr = stderr.decode('utf-8', 'replace')
stdout = stdout.decode("utf-8", "replace")
stderr = stderr.decode("utf-8", "replace")
return p.returncode, stdout.strip(), stderr.strip()

class Feeder(base.Feeder):

class Feeder(base.Feeder):
def __iter__(self):
for n, i in self.feeds.items():

assert 'url' in i
remote = i['url']
assert "url" in i
remote = i["url"]

branch = i.get('branch', 'master')
branch = i.get("branch", "master")

state = self.resource.get((remote, branch))

@@ -34,10 +33,11 @@ def __iter__(self):
if state is None:
# This is the first time we've encountered this repository. We
# need to clone it.
ret, _, stderr = run(['git', 'clone', '--bare', '--branch',
branch, remote, '.'], tmp)
ret, _, stderr = run(
["git", "clone", "--bare", "--branch", branch, remote, "."], tmp
)
if ret != 0:
yield Exception(f'failed to clone {remote}:\n{stderr}')
yield Exception(f"failed to clone {remote}:\n{stderr}")
shutil.rmtree(tmp)
continue

@@ -53,19 +53,23 @@ def __iter__(self):
t.extractall(tmp)

# Update the history in the working directory.
ret, _, stderr = run(['git', 'fetch', remote, f'{branch}:{branch}'], tmp)
ret, _, stderr = run(
["git", "fetch", remote, f"{branch}:{branch}"], tmp
)
if ret != 0:
yield Exception('failed to update temporary working '
f'directory for {remote}:\n{stderr}')
yield Exception(
"failed to update temporary working "
f"directory for {remote}:\n{stderr}"
)
shutil.rmtree(tmp)
continue

# Now retrieve the log and look for new commits.
ret, stdout, stderr = run(['git', 'log', '--reverse', '--pretty=%H',
branch], tmp)
ret, stdout, stderr = run(
["git", "log", "--reverse", "--pretty=%H", branch], tmp
)
if ret != 0:
yield Exception('failed to retrieve Git log of '
f'{remote}:\n{stderr}')
yield Exception("failed to retrieve Git log of " f"{remote}:\n{stderr}")
shutil.rmtree(tmp)
continue

@@ -76,17 +80,22 @@ def __iter__(self):
if last_commit is None or seen_last_commit:
# This is a new commit.

ret, summary, stderr = run(['git', 'log', '-n', '1',
'--format=%s', commit], tmp)
ret, summary, stderr = run(
["git", "log", "-n", "1", "--format=%s", commit], tmp
)
if ret != 0:
yield Exception('failed to retrieve summary for Git '
f'commit {commit} of {remote}:\n{stderr}')
yield Exception(
"failed to retrieve summary for Git "
f"commit {commit} of {remote}:\n{stderr}"
)
continue

ret, diff, stderr = run(['git', 'show', commit], tmp)
ret, diff, stderr = run(["git", "show", commit], tmp)
if ret != 0:
yield Exception('failed to retrieve diff for Git '
f'commit {commit} of {remote}:\n{stderr}')
yield Exception(
"failed to retrieve diff for Git "
f"commit {commit} of {remote}:\n{stderr}"
)
continue

yield base.Entry(n, summary, diff)
@@ -101,7 +110,7 @@ def __iter__(self):
# bother compressing it because the resources as a whole are
# compressed.
buffer = io.BytesIO()
with tarfile.open(fileobj=buffer, mode='w') as t:
with tarfile.open(fileobj=buffer, mode="w") as t:
for item in Path(tmp).iterdir():
t.add(item, item.name)
data = buffer.getvalue()
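
The state-persistence trick above (packing the bare clone into an uncompressed in-memory tar so it can be stored alongside the feed state, then unpacking it into a fresh temporary directory on the next run) can be illustrated with a standalone stdlib round trip; the file name and contents below are placeholders:

import io
import tarfile
import tempfile
from pathlib import Path

# Pack a directory into an in-memory tar, as the feeder does when saving state.
src = tempfile.mkdtemp()
Path(src, "HEAD").write_text("ref: refs/heads/master\n")

buffer = io.BytesIO()
with tarfile.open(fileobj=buffer, mode="w") as t:
    for item in Path(src).iterdir():
        t.add(item, item.name)
state = buffer.getvalue()  # this is what would be stored in self.resource

# On the next run, unpack the saved bytes into a fresh temporary directory.
dst = tempfile.mkdtemp()
with tarfile.open(fileobj=io.BytesIO(state)) as t:
    t.extractall(dst)
assert (Path(dst) / "HEAD").read_text() == "ref: refs/heads/master\n"
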
23 changes: 12 additions & 11 deletions feeders/htmldiff.py
@@ -6,25 +6,26 @@
class Feeder(base.Feeder):
def __iter__(self):
for n, i in self.feeds.items():
assert 'url' in i
url = i['url']
assert "url" in i
url = i["url"]
if url in self.resource:
old = self.resource[url].splitlines()
oldurl = url
else:
old = []
oldurl = '/dev/null'
oldurl = "/dev/null"
try:
new = base.download(url).decode('utf-8', 'replace').strip().splitlines()
new = base.download(url).decode("utf-8", "replace").strip().splitlines()
except Exception as e:
yield Exception(f'Error while loading {url}: {e}')
yield Exception(f"Error while loading {url}: {e}")
continue
lines = list(difflib.unified_diff(old, new, fromfile=oldurl,
tofile=url, lineterm=''))
if i.get('ignore_white_space', 'yes').lower() == 'yes':
lines = list(
difflib.unified_diff(old, new, fromfile=oldurl, tofile=url, lineterm="")
)
if i.get("ignore_white_space", "yes").lower() == "yes":
lines = list(diffcommon.suppress_whitespace(lines))
if len(lines) > 2:
content = '\n'.join(lines)
yield base.Entry(n, f'{url} changes', content)
self.resource[url] = '\n'.join(new)
content = "\n".join(lines)
yield base.Entry(n, f"{url} changes", content)
self.resource[url] = "\n".join(new)
yield base.SyncRequest()
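
For contrast with feeders/diff.py above: this feeder diffs the raw decoded markup, whereas diff.py first reduces the page to its visible text with BeautifulSoup. A small illustration of the difference (the HTML is a placeholder):

import bs4

raw = b"<html><body><h1>Title</h1><p>Hello <b>world</b></p></body></html>"

# What feeders/diff.py compares: the visible text only.
text = bs4.BeautifulSoup(raw.strip(), "html.parser").get_text()
# What feeders/htmldiff.py compares: the markup itself.
markup = raw.decode("utf-8", "replace").strip()

print(text)    # e.g. "TitleHello world"
print(markup)  # the original tags, so markup-only edits also show up in the diff
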
27 changes: 14 additions & 13 deletions feeders/jumpthrough.py
@@ -6,14 +6,14 @@
class Feeder(base.Feeder):
def __iter__(self):
for n, i in self.feeds.items():
assert 'url' in i
url = i['url']
assert "url" in i
url = i["url"]
data = self.resource.get(url, {})
if isinstance(data, dict): # new scheme
etag = data.get('etag')
modified = data.get('modified')
seen = data.get('seen', [])[:]
else: # old scheme
if isinstance(data, dict): # new scheme
etag = data.get("etag")
modified = data.get("modified")
seen = data.get("seen", [])[:]
else: # old scheme
assert isinstance(data, list)
etag = None
modified = None
@@ -26,18 +26,19 @@ def __iter__(self):
if id not in seen:
try:
data = base.download(e.link)
yield base.Entry(n, e.title, data, \
date=rsscommon.get_date(e), html=True)
yield base.Entry(
n, e.title, data, date=rsscommon.get_date(e), html=True
)
except urllib.error.HTTPError:
# Suppress 404s from broken links.
pass
seen.append(id)
# save in new scheme
self.resource[url] = {
'etag':etag,
'modified':modified,
'seen':seen,
"etag": etag,
"modified": modified,
"seen": seen,
}
yield base.SyncRequest()
except Exception as e:
yield Exception(f'Error from feed {n}: {e}')
yield Exception(f"Error from feed {n}: {e}")
44 changes: 26 additions & 18 deletions feeders/rss.py
@@ -4,14 +4,14 @@
class Feeder(base.Feeder):
def __iter__(self):
for n, i in self.feeds.items():
assert 'url' in i
url = i['url']
assert "url" in i
url = i["url"]
data = self.resource.get(url, {})
if isinstance(data, dict): # new scheme
etag = data.get('etag')
modified = data.get('modified')
seen = data.get('seen', [])[:]
else: # old scheme
if isinstance(data, dict): # new scheme
etag = data.get("etag")
modified = data.get("modified")
seen = data.get("seen", [])[:]
else: # old scheme
assert isinstance(data, list)
etag = None
modified = None
@@ -24,21 +24,29 @@ def __iter__(self):
id = rsscommon.get_id(e)
if id not in seen:
links = rsscommon.get_links(e)
yield base.Entry(n, e.title,
'<p><b>%(title)s</b><br/><font size="-1">%(links)s</font></p>%(content)s' % {
'title':rsscommon.get_title(e),
'links':'<br/>'.join(f'<a href="{x}">{x}</a>' for x in links),
'content':rsscommon.get_content(e),
}, date=rsscommon.get_date(e), html=True)
yield base.Entry(
n,
e.title,
'<p><b>%(title)s</b><br/><font size="-1">%(links)s</font></p>%(content)s'
% {
"title": rsscommon.get_title(e),
"links": "<br/>".join(
f'<a href="{x}">{x}</a>' for x in links
),
"content": rsscommon.get_content(e),
},
date=rsscommon.get_date(e),
html=True,
)
seen.append(id)
except Exception as e:
yield Exception(f'Error from feed {n}: {e}')
yield Exception(f"Error from feed {n}: {e}")
# save in new scheme
self.resource[url] = {
'etag':etag,
'modified':modified,
'seen':seen,
"etag": etag,
"modified": modified,
"seen": seen,
}
yield base.SyncRequest()
except Exception as e:
yield Exception(f'Error from feed {n}: {e}')
yield Exception(f"Error from feed {n}: {e}")