Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check for numbers in sentences can be switched off #100

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ MANIFEST

# Per-project virtualenvs
.virtualenv/
venv/
20 changes: 20 additions & 0 deletions src/corporacreator/argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,17 @@ def _check_positive(value):
return ivalue


def _check_boolean(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')


def parse_args(args):
"""Parse command line parameters

Expand Down Expand Up @@ -89,4 +100,13 @@ def parse_args(args):
help="Maximum number of times a sentence can appear in a corpus.",
dest="duplicate_sentence_count",
)
parser.add_argument(
"-c",
"--check-for-digits",
default=True,
required=False,
type=_check_boolean,
help="Check sentences for digits",
dest="check_for_digits",
)
return parser.parse_args(args)
8 changes: 4 additions & 4 deletions src/corporacreator/corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
_logger = logging.getLogger(__name__)


def common_wrapper(sentence, up_votes, down_votes):
is_valid, sentence = common(sentence)
if False == is_valid:
def common_wrapper(sentence, up_votes, down_votes, check_for_digits):
is_valid, sentence = common(sentence, check_for_digits)
if not is_valid:
up_votes = 0
down_votes = 2
return pd.Series([sentence, up_votes, down_votes])
Expand Down Expand Up @@ -42,7 +42,7 @@ def create(self):
corpora_data = self._parse_tsv()
corpora_data[["sentence", "up_votes", "down_votes"]] = corpora_data[
["sentence", "up_votes", "down_votes"]
].swifter.apply(func=lambda arg: common_wrapper(*arg), axis=1)
].swifter.apply(func=lambda arg: common_wrapper(*arg, self.args.check_for_digits), axis=1)
if self.args.langs:
# check if all languages provided at command line are actually
# in the clips.tsv file, if not, throw error
Expand Down
4 changes: 2 additions & 2 deletions src/corporacreator/preprocessors/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _strip_string(sentence):
return u''.join([c for c in sentence if unicodedata.category(c) in allowed_categories])


def common(sentence):
def common(sentence, check_for_digits):
"""Cleans up the passed sentence in a language independent manner, removing or reformatting invalid data.

Args:
Expand All @@ -88,7 +88,7 @@ def common(sentence):
sentence = (' ').join(sentence.split())
# TODO: Clean up data in a language independent manner
# If the sentence contains digits reject it
if _has_digit(sentence):
if check_for_digits and _has_digit(sentence):
is_valid = False
# If the sentence is blank reject it
if not sentence.strip():
Expand Down