-
Notifications
You must be signed in to change notification settings - Fork 1
/
run.sh
77 lines (56 loc) · 3.08 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Copyright (c) 2013-2014 Lingpeng Kong
# All Rights Reserved.
#
# This file is part of TweeboParser 1.0.
#
# TweeboParser 1.0 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# TweeboParser 1.0 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with TweeboParser 1.0. If not, see <http://www.gnu.org/licenses/>.
# This script runs the whole pipeline of TweeboParser. It reads a raw text input
# and produces CoNLL-format dependency parses as its output (it calls all the
# necessary components, such as the POS tagger, along the way).
# Get the paths of the components of TweeboParser.
# Abort on the first failing command and on failures inside pipelines, so a
# broken pipeline stage cannot silently feed garbage to the next stage.
set -e -o pipefail

# Directory containing this script, resolved to an absolute path so the
# pipeline can be launched from any working directory.
ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
readonly ROOT_DIR
readonly SCRIPT_DIR="${ROOT_DIR}/scripts"              # helper scripts (tokenize/tag, feature augmentation)
readonly TAGGER_DIR="${ROOT_DIR}/ark-tweet-nlp-0.3.2"  # Twitter POS tagger
readonly PARSER_DIR="${ROOT_DIR}/TBParser"             # TurboParser binary lives here
readonly TOKENSEL_DIR="${ROOT_DIR}/token_selection"    # token-selection tool
readonly MODEL_DIR="${ROOT_DIR}/pretrained_models"     # pretrained tagger/parser models
readonly WORKING_DIR="${ROOT_DIR}/working_dir"         # scratch space for intermediate files
# To run the parser:
if [ "$#" -ne 1 ]; then
    # Misuse: report on stderr and exit non-zero so callers can detect it.
    echo "Usage: ./run.sh [path_to_raw_input_file_one_sentence_a_line]" >&2
    exit 1
else
    # Starting point:
    # -- Raw text tweets, one line per tweet.
    INPUT_FILE="$1"

    # --> Run the Twitter POS tagger on top of it (tokenization and conversion
    #     to CoNLL format happen along the way). Produces ${WORKING_DIR}/tagger.out.
    "${SCRIPT_DIR}/tokenize_and_tag.sh" "${ROOT_DIR}" "${TAGGER_DIR}" "${WORKING_DIR}" "${MODEL_DIR}" "${SCRIPT_DIR}" "${INPUT_FILE}"

    # --> Append Brown clusters to the end of each word.
    python "${SCRIPT_DIR}/AugumentBrownClusteringFeature46.py" "${MODEL_DIR}/twitter_brown_clustering_full" "${WORKING_DIR}/tagger.out" N > "${WORKING_DIR}/tag.br.out"
    rm "${WORKING_DIR}/tagger.out"

    # --> Run the token-selection tool to append token-selection decisions to
    #     the end of each word; its output is the parser's test file.
    python "${TOKENSEL_DIR}/pipeline.py" "${WORKING_DIR}/tag.br.out" "${MODEL_DIR}/tokensel_weights" > "${WORKING_DIR}/test"
    rm "${WORKING_DIR}/tag.br.out"

    # -- Start parsing. Fail loudly if the parser directory is missing.
    cd "${PARSER_DIR}" || { echo "run.sh: cannot cd to ${PARSER_DIR}" >&2; exit 1; }
    # ${LD_LIBRARY_PATH:-} keeps this safe when the variable is unset.
    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/deps/local/lib:"

    # --> Parse the first time using the PTB model to get posterior scores.
    rm -rf "${WORKING_DIR}/test_score"
    mkdir "${WORKING_DIR}/test_score"
    ./TurboParser --test --file_model="${MODEL_DIR}/ptb_parsing_model" --file_test="${WORKING_DIR}/test" --file_prediction="${WORKING_DIR}/ptb_single_predict_test" --output_posterior=true --use_posterior=false --posterior_dir="${WORKING_DIR}/test_score" --logtostderr

    # --> Parse the second time using the PTB scores as features to get the
    #     final dependency parses.
    ./TurboParser --test --file_model="${MODEL_DIR}/parsing_model" --file_test="${WORKING_DIR}/test" --file_prediction="${WORKING_DIR}/test_predict" --output_posterior=false --use_posterior=true --posterior_dir="${WORKING_DIR}/test_score" --logtostderr

    # -- Output the results next to the input file.
    cd "${ROOT_DIR}"
    cat "${WORKING_DIR}/test_predict" > "${INPUT_FILE}.predict"
fi