Skip to content

Commit

Permalink
Update script for new dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
robinhad committed Mar 11, 2021
1 parent a6cdaf5 commit 605556b
Showing 1 changed file with 20 additions and 5 deletions.
25 changes: 20 additions & 5 deletions scripts/import_ukrainian.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
print_import_report,
)
from ds_ctcdecoder import Alphabet
import re

FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
SAMPLE_RATE = 16000
Expand Down Expand Up @@ -108,6 +109,12 @@ def one_sample(sample):
return (counter, rows)


def convert_transcript(transcript):
    """Normalise a raw transcript line for the Ukrainian dataset.

    Three things happen, in order:

    1. A straight apostrophe (') that sits between two lowercase Cyrillic
       letters is replaced by the typographic apostrophe (’). Only the
       apostrophe itself is replaced; the neighbouring letters are kept.
    2. Every hyphen is replaced by a single space.
    3. Leading and trailing whitespace is stripped.

    Args:
        transcript: The transcript text to normalise.

    Returns:
        The normalised transcript string.
    """
    # Lookbehind/lookahead keep the surrounding letters out of the match, so
    # the substitution cannot swallow them. The class lists і, ї, є and ґ
    # explicitly because they lie outside the contiguous а-я code point range.
    transcript = re.sub(
        r"(?<=[а-яіїєґ])'(?=[а-яіїєґ])", "’", transcript)
    transcript = transcript.replace("-", " ")
    return transcript.strip()


def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
# iterate over all data lists and write converted version near them
speaker_iterator = 1
Expand All @@ -124,17 +131,24 @@ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_cha
os.path.dirname(subdir), "wav")
file_dict = dict()
for row in file.readlines():
file_name, transcript = row.replace(
" \n", "").split(" ", 1)
if row.isspace():
continue
splitted_row = row.replace("\n", "").replace(
" wav ", ".wav ").split(" ", 1)
if len(splitted_row) != 2:
continue
file_name, transcript = splitted_row
if file_name.endswith(".wav"):
pass
elif file_name.endswith(".mp3"):
pass
elif file_name.find(".") == -1:
file_name += ".wav"

file_name = os.path.join(file_folder, file_name)
file_dict[file_name] = transcript
if file_name.startswith("/"):
file_name = file_name[1::]
file_name = os.path.join(dataset_dir, file_name)
file_dict[file_name] = convert_transcript(transcript)

file.close()

Expand Down Expand Up @@ -176,7 +190,8 @@ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_cha
print("Writing CSV file for DeepSpeech.py as: ", output_csv)
writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
writer.writeheader()
bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
bar = progressbar.ProgressBar(
max_value=len(rows), widgets=SIMPLE_BAR)
for filename, file_size, transcript, speaker in bar(rows):
if space_after_every_character:
writer.writerow(
Expand Down

0 comments on commit 605556b

Please sign in to comment.