Skip to content

Commit

Permalink
analyze.py: special case: no raw star TS, but resampled
Browse files Browse the repository at this point in the history
Then do not error out with

230930-23:13:52.237 INFO: Parse (raw) stargazer time series CSV: stars-raw.csv
Traceback (most recent call last):
...
FileNotFoundError: [Errno 2] No such file or directory: 'stars-raw.csv'

This was an oversight.

This is the logical counterpart of fetch.py not
fetching the raw series when the count hasn't changed.
  • Loading branch information
jgehrcke committed Sep 30, 2023
1 parent 188cc59 commit 09ab052
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 25 deletions.
81 changes: 56 additions & 25 deletions analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -1397,34 +1397,64 @@ def symlog_or_lin(df, colname, threshold):


def read_stars_over_time_from_csv() -> pd.DataFrame:
if not ARGS.stargazer_ts_inpath:

df_stargazers_complete = pd.DataFrame({"time": [], "stars_cumulative": []})

if not ARGS.stargazer_ts_inpath and not ARGS.stargazer_ts_snapshot_inpath:
log.info("stargazer_ts_inpath not provided, return emtpy df")
return pd.DataFrame()
return df_stargazers_complete

log.info("Parse (raw) stargazer time series CSV: %s", ARGS.stargazer_ts_inpath)
raw_ts_latest_datetime = None

df_40klim = pd.read_csv( # type: ignore
ARGS.stargazer_ts_inpath,
index_col=["time_iso8601"],
date_parser=lambda col: pd.to_datetime(col, utc=True),
)
if os.path.exists(ARGS.stargazer_ts_inpath):
log.info("Parse (raw) stargazer time series CSV: %s", ARGS.stargazer_ts_inpath)

df_40klim.index.rename("time", inplace=True)
log.info("stars_cumulative, raw data: %s", df_40klim["stars_cumulative"])
df_40klim = pd.read_csv( # type: ignore
ARGS.stargazer_ts_inpath,
index_col=["time_iso8601"],
date_parser=lambda col: pd.to_datetime(col, utc=True),
)

if not len(df_40klim):
log.info("CSV file did not contain data, return empty df")
return df_40klim
df_40klim.index.rename("time", inplace=True)
log.info("stars_cumulative, raw data: %s", df_40klim["stars_cumulative"])

if not len(df_40klim):
log.info("CSV file did not contain data, return empty df")
return df_40klim

raw_ts_latest_datetime = df_40klim.index[-1]
# log.info("df_40klim.index: %s", df_40klim.index)
# log.info("raw_ts_latest_datetime: %s", raw_ts_latest_datetime)
raw_ts_latest_datetime = df_40klim.index[-1]

# Just to reiterate, this is expected to be the 'raw' API-provided
# timeseries, including each individual stargazer event up to 40k. It may
# not be reasonable to plot this as-is, depending on density and overall
# amount of data points.
df_stargazers_complete = df_40klim
# log.info("df_40klim.index: %s", df_40klim.index)
# log.info("raw_ts_latest_datetime: %s", raw_ts_latest_datetime)

# Just to reiterate, this is expected to be the 'raw' API-provided
# timeseries, including each individual stargazer event up to 40k. It may
# not be reasonable to plot this as-is, depending on density and overall
# amount of data points.
df_stargazers_complete = df_40klim

elif os.path.exists(ARGS.stargazer_ts_resampled_outpath):
# This is an interesting tidbit; no 'raw' series was provided, but a
# previously written resampled timeseries. Read this, assuming it
# reflects a downsampled version of the first 40k stargazers.
log.info(
"No raw star TS provided. Parse (previously resampled) stargazer time series CSV: %s",
ARGS.stargazer_ts_resampled_outpath,
)
df_resampled = pd.read_csv( # type: ignore
ARGS.stargazer_ts_resampled_outpath,
index_col=["time_iso8601"],
date_parser=lambda col: pd.to_datetime(col, utc=True),
)
df_resampled.index.rename("time", inplace=True)
log.info(
"stars_cumulative, previously resampled: %s",
df_resampled["stars_cumulative"],
)

# Here, the variable name becomes misleading.
raw_ts_latest_datetime = df_resampled.index[-1]
df_stargazers_complete = df_resampled

# When ending up here: there is at least one stargazer (fast exit above for
# case 0). Note: the existence of the file `stargazer_ts_snapshot_inpath`
Expand All @@ -1451,9 +1481,10 @@ def read_stars_over_time_from_csv() -> pd.DataFrame:
# Defensive: select only those data points that are newer than those in
# df_40klim.
# log.info("df_snapshots_beyond40k.index: %s", df_snapshots_beyond40k.index)
df_snapshots_beyond40k = df_snapshots_beyond40k[
df_snapshots_beyond40k.index > raw_ts_latest_datetime
]
if raw_ts_latest_datetime is not None:
df_snapshots_beyond40k = df_snapshots_beyond40k[
df_snapshots_beyond40k.index > raw_ts_latest_datetime
]

# Is at least one data point left?
if len(df_snapshots_beyond40k):
Expand Down Expand Up @@ -1494,7 +1525,7 @@ def read_stars_over_time_from_csv() -> pd.DataFrame:
df_stargazers_for_plot = df_stargazers_complete

# Many data points? Downsample, for plotting.
if len(df_stargazers_complete) > 50:
if len(df_stargazers_for_plot) > 50:
df_stargazers_for_plot = downsample_series_to_N_points(
df_stargazers_complete, "stars_cumulative"
)
Expand Down
13 changes: 13 additions & 0 deletions tests/analyze.bats
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,16 @@ setup() {
assert_exist $BATS_TEST_TMPDIR/outdir/report_for_pdf.html
}

@test "analyze.py: snapshots: some, vcagg: no, stars: none, forks: some, starsnaps: yes" {
run python analyze.py owner/repo tests/data/A/snapshots \
--resources-directory=resources \
--output-directory $BATS_TEST_TMPDIR/outdir \
--outfile-prefix "" \
--stargazer-ts-resampled-outpath stargazers-rs.csv \
--views-clones-aggregate-inpath tests/data/A/views_clones_aggregate.csv \
--fork-ts-inpath=tests/data/A/forks.csv \
--stargazer-ts-inpath=tests/data/A/stars-noexist.csv \
--stargazer-ts-snapshot-inpath=tests/data/A/stars-snapshots.csv
[ "$status" -eq 0 ]
assert_exist $BATS_TEST_TMPDIR/outdir/report_for_pdf.html
}

0 comments on commit 09ab052

Please sign in to comment.