-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_unintend_prod.py
239 lines (211 loc) · 14.2 KB
/
check_unintend_prod.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import os
import sys
from subprocess import DEVNULL
from subprocess import PIPE
from subprocess import Popen

import pandas as pd

from BLAST_utils import check_blastDB
from Bowtie_utils import check_BowtieIdx
def _run_blast(query, ref, num_threads, max_genomic_hits):
    """Align *query* with blastn-short and pre-parse the hits.

    Writes filtered alignments to ``{query}.out.preparsed`` (outfmt-6-style
    tab-separated lines) and returns the list of primer names that had
    >= max_genomic_hits hits in the genome (their alignments are dropped).
    Removes *query* and the raw ``{query}.out`` when done.
    """
    # locate BLAST binaries and the pre-built database for this reference
    BLAST_bin, exe_suffix, BLAST_db_path = check_blastDB(ref)
    cmd = [f"{BLAST_bin}blastn{exe_suffix}", "-task", "blastn-short",
           "-query", f"{query}", "-db", f"{BLAST_db_path}",
           "-max_hsps", "2000", "-num_threads", f"{num_threads}",
           "-perc_identity", "75",
           "-outfmt", "6 qseqid sseqid qstart qend sstart send pident mismatch",
           "-out", f"{query}.out"]
    p = Popen(cmd, universal_newlines=True)
    p.communicate()  # wait for BLAST to finish
    os.remove(query)
    # count alignments per primer in memory (no temp .uniq_count file needed)
    hit_count = {}
    with open(f"{query}.out", "r") as fh:
        for line in fh:
            pname = line.split("\t")[0]
            hit_count[pname] = hit_count.get(pname, 0) + 1
    # primers with too many hits in the genome will not be considered
    to_remove = [pname for pname, n in hit_count.items() if n >= max_genomic_hits]
    with open(f"{query}.out", "r") as fh, open(f"{query}.out.preparsed", "w") as w:
        for line in fh:
            if line.split("\t")[0] not in to_remove:
                w.write(line)
    os.remove(f"{query}.out")
    return to_remove


def _run_bowtie(query, ref, num_threads, max_genomic_hits):
    """Align *query* with bowtie and convert hits to the BLAST-like format.

    Writes filtered alignments to ``{query}.out.preparsed`` and returns the
    list of primer names with more than max_genomic_hits reported genomic
    hits. Alignments with a mismatch after the 15th base are dropped.
    Removes *query* and the raw ``{query}.out`` when done.
    """
    Bowtie_bin, exe_suffix, Bowtie_idx_path = check_BowtieIdx(ref)
    with open(f"{query}.out", "w") as outfile:
        # -f fasta input; -k 1000 report up to 1000 hits; -v 3 up to 3 mismatches;
        # --suppress 5,6 drops the read/quality columns from the output
        cmd = [f"{Bowtie_bin}bowtie{exe_suffix}", "-f", "-k", "1000", "-v", "3",
               "--threads", f"{num_threads}", "--suppress", "5,6", "--quiet",
               "-x", f"{Bowtie_idx_path}", f"{query}"]
        p = Popen(cmd, universal_newlines=True, stdout=outfile, stderr=DEVNULL)
        p.communicate()  # wait for Bowtie to finish
    os.remove(query)
    to_remove = []
    with open(f"{query}.out.preparsed", "w") as f, open(f"{query}.out", "r") as r:
        for line in r:
            # fields: name, strand, ref, offset, #other-hits, mismatch-descriptors
            fields = line.strip().split("\t")
            pname = fields[0]
            largest_mismatch_pos = 0
            num_mismatch = 0
            if len(fields) > 5:  # mismatch descriptor column is present
                mismatch_desc_fields = fields[5].split(",")
                num_mismatch = len(mismatch_desc_fields)
                for desc in mismatch_desc_fields:
                    # descriptor looks like "pos:ref>read"; track the rightmost one
                    largest_mismatch_pos = max(largest_mismatch_pos, int(desc.split(":")[0]))
            if largest_mismatch_pos <= 15:  # require no mismatch after the 15th base
                if int(fields[4]) <= max_genomic_hits:  # not over-represented
                    strand = fields[1]
                    # reconstruct approximate subject coords assuming an average
                    # primer length of 20 bp
                    if strand == "+":
                        sstart = int(fields[3])
                        send = int(fields[3]) + 20
                    else:
                        sstart = int(fields[3]) + 20
                        send = int(fields[3])
                    pident = 100 - num_mismatch / 20 * 100
                    f.write(f"{pname}\t{fields[2]}\t1\t20\t{sstart}\t{send}\t{pident}\t{num_mismatch}\n")
                else:
                    to_remove.append(pname)  # too many hits in the genome
    os.remove(f"{query}.out")
    return to_remove


def check_unintended_products(dict_primers, len_input, ref, cut_chr, cut_coord, nonspecific_primers, thread, fhlog, outdir, aligner="Bowtie"):
    """
    Check primer pairs for unintended PCR products and flag offending pairs.

    Aligns every candidate primer against the reference genome (via BLAST or
    Bowtie), enumerates primer-hit pairs that could yield a PCR product, and
    flags pairs that amplify anything other than the single intended target
    spanning (cut_chr, cut_coord).

    :param dict_primers: primer3-style dict; reads PRIMER_PAIR_NUM_RETURNED
        and PRIMER_LEFT/RIGHT_{i}_SEQUENCE keys
    :param len_input: length of the primer input (kept for interface
        compatibility; not used in this function)
    :param ref: reference genome identifier used to locate the BLAST db /
        Bowtie index
    :param cut_chr: chromosome of the intended cut site
    :param cut_coord: coordinate of the intended cut site
    :param nonspecific_primers: dict keyed by concatenated F+R sequence of
        pairs already known (from previous iterations) to be nonspecific
    :param thread: "auto" (all CPUs minus 2) or an int-like string
    :param fhlog: open writable log file handle
    :param outdir: directory for temporary files
    :param aligner: "BLAST" or "Bowtie"
    :return: dict_primers updated with key "UNSPECIFIC_PRIMER_PAIR_idx"
        (a set of flagged primer-pair indices)
    """
    dict_primers["UNSPECIFIC_PRIMER_PAIR_idx"] = set()
    if thread == "auto":
        numThread2use = max(1, os.cpu_count() - 2)  # use all CPUs except 2
    else:
        numThread2use = int(thread)
    if dict_primers["PRIMER_PAIR_NUM_RETURNED"] == 0:
        return dict_primers  # nothing to check

    max_pcr_prod_size = 6000
    # max number of hsps across all hits allowed per primer; offending
    # primers will not be considered
    max_genomic_hits = 5000
    fhlog.write(f"Listing all possible PCR products < {max_pcr_prod_size} bp\n")

    dict_primer_len = {}  # primer name -> length, for 3'-end tracking
    # write the primer fasta, skipping pairs already known to be nonspecific
    empty_file_flag = 1  # stays 1 if primer.fa ends up empty
    tmp_fa = os.path.join(outdir, f"chr{cut_chr}_{cut_coord}_primer.fa")
    with open(tmp_fa, "w") as wfh:
        for i in range(dict_primers["PRIMER_PAIR_NUM_RETURNED"]):
            Lseq = dict_primers["PRIMER_LEFT_{}_SEQUENCE".format(i)]
            Rseq = dict_primers["PRIMER_RIGHT_{}_SEQUENCE".format(i)]
            if f"{Lseq}{Rseq}" not in nonspecific_primers:
                wfh.write(f">{i}_F\n{Lseq}\n>{i}_R\n{Rseq}\n")
                dict_primer_len[f"{i}_F"] = len(Lseq)
                dict_primer_len[f"{i}_R"] = len(Rseq)
                empty_file_flag = 0
            else:
                fhlog.write(f"skipping {i}_F {i}_R due to nonspecific product detected in previous iteration(s)\n")
                dict_primers["UNSPECIFIC_PRIMER_PAIR_idx"].add(i)

    if empty_file_flag == 0:
        query = tmp_fa
        if aligner == "BLAST":
            to_remove = _run_blast(query, ref, numThread2use, max_genomic_hits)
        elif aligner == "Bowtie":
            to_remove = _run_bowtie(query, ref, numThread2use, max_genomic_hits)
        else:
            sys.exit(f"[ERROR] Invalid choice of aligner: {aligner}")
        # flag primers that were removed for excessive genomic hits;
        # int() so the set holds the same type as the indices added above
        for primer in to_remove:
            dict_primers["UNSPECIFIC_PRIMER_PAIR_idx"].add(int(primer.split("_")[0]))
        # parse the pre-filtered alignments and pair up hits per primer pair
        df = pd.read_csv(f"{query}.out.preparsed", sep="\t",
                         names=["qseqid", "sseqid", "qstart", "qend", "sstart", "send", "pident", "mismatch"])
        os.remove(f"{query}.out.preparsed")
        primer_idx = sorted({name.split("_")[0] for name in df["qseqid"].unique()}, key=float)
        for idx in primer_idx:
            df_1pair = df[(df["qseqid"] == idx + "_F") | (df["qseqid"] == idx + "_R")]
            dict_primers = find_primer_parings(df_1pair=df_1pair, dict_primers=dict_primers,
                                               max_pcr_prod_size=max_pcr_prod_size,
                                               dict_primer_len=dict_primer_len, idx=idx,
                                               cut_chr=cut_chr, cut_coord=cut_coord, fhlog=fhlog)
    return dict_primers
def find_primer_parings(df_1pair, dict_primers, max_pcr_prod_size, dict_primer_len, idx, cut_chr, cut_coord, fhlog):
    """
    Enumerate possible PCR products for one primer pair and flag it if any
    unintended product (or more than one intended product) is found.

    :param df_1pair: DataFrame of genomic hits for primers {idx}_F / {idx}_R
        with BLAST outfmt-6-style columns
        (qseqid, sseqid, qstart, qend, sstart, send, pident, mismatch)
    :param dict_primers: primer dict; flagged pairs are added (as int) to
        dict_primers["UNSPECIFIC_PRIMER_PAIR_idx"]
    :param max_pcr_prod_size: maximum product length considered amplifiable
    :param dict_primer_len: primer name -> primer length (3'-end matching)
    :param idx: primer-pair index (digit string, e.g. "0")
    :param cut_chr: chromosome of the intended cut site
    :param cut_coord: coordinate of the intended cut site
    :param fhlog: open writable log file handle
    :return: dict_primers (updated); returns as soon as one unintended
        product is found, skipping all further checks for this pair
    """
    for current_chr in df_1pair["sseqid"].unique():
        df_1pair_1chr = df_1pair[df_1pair["sseqid"] == current_chr]
        # keep only hits whose 3' end matches the template; otherwise the
        # dataframe may be very big
        F_len = dict_primer_len[f"{idx}_F"]
        R_len = dict_primer_len[f"{idx}_R"]
        df_1pair_1chr = df_1pair_1chr[
            ((df_1pair_1chr["qseqid"] == f"{idx}_F") & (df_1pair_1chr["qend"] == F_len)) |
            ((df_1pair_1chr["qseqid"] == f"{idx}_R") & (df_1pair_1chr["qend"] == R_len))]
        intended_count = 0  # intended targets seen for this pair on this chr
        seen_pairs = set()  # frozensets of row-label pairs; avoids i-j / j-i repeats in O(1)
        for idx_i, row_i in df_1pair_1chr.iterrows():
            # only hits within max_pcr_prod_size (plus slack) of the anchor
            # can pair with it to form a product
            anchor = row_i["sstart"]
            nearby = df_1pair_1chr[
                (df_1pair_1chr["sstart"] >= (anchor - max_pcr_prod_size - 100)) &
                (df_1pair_1chr["sstart"] <= (anchor + max_pcr_prod_size + 100))]
            for idx_j, row_j in nearby.iterrows():
                pair_key = frozenset((idx_i, idx_j))
                if pair_key in seen_pairs:
                    continue
                seen_pairs.add(pair_key)
                # sign of (sstart - send) encodes the hit's strand
                strand_i = row_i["sstart"] - row_i["send"]
                strand_j = row_j["sstart"] - row_j["send"]
                coord_3p_i = row_i["send"]
                coord_3p_j = row_j["send"]
                lo_5p = min(row_i["sstart"], row_j["sstart"])
                hi_5p = max(row_i["sstart"], row_j["sstart"])
                # require hits on opposite strands with 3' ends facing inwards
                if not (strand_i * strand_j < 0 and lo_5p < coord_3p_i < hi_5p and lo_5p < coord_3p_j < hi_5p):
                    continue
                coords = [row_i["sstart"], row_i["send"], row_j["sstart"], row_j["send"]]
                max_coord = max(coords)
                min_coord = min(coords)
                dist = max_coord - min_coord
                # require the PCR product to be between 20 bp and max_pcr_prod_size
                if not (20 <= dist <= max_pcr_prod_size):
                    continue
                # require sufficient identity on both primers
                if float(row_i["pident"]) < 75.0 or float(row_j["pident"]) < 75.0:
                    continue
                # require both 3' ends to match (redundant with the filter
                # above, kept as a safety net)
                if row_i["qend"] != dict_primer_len[row_i["qseqid"]] or row_j["qend"] != dict_primer_len[row_j["qseqid"]]:
                    continue
                if str(current_chr) == str(cut_chr) and min_coord < cut_coord < max_coord:
                    # intended product: spans the cut site on the right chromosome
                    fhlog.write(f"{row_i['qseqid']} chr={row_i['sseqid']} {row_i['sstart']}-{row_i['send']} {row_j['qseqid']} chr={row_j['sseqid']} {row_j['sstart']}-{row_j['send']} product_size = {dist} (intended PCR product)\n")
                    intended_count += 1
                    if intended_count > 1:
                        # more than one "intended" product still means unspecific;
                        # int() keeps the set's member type consistent
                        dict_primers["UNSPECIFIC_PRIMER_PAIR_idx"].add(int(idx))
                else:
                    # unintended product: flag the pair and stop checking it
                    dict_primers["UNSPECIFIC_PRIMER_PAIR_idx"].add(int(idx))
                    fhlog.write(f"{row_i['qseqid']} chr={row_i['sseqid']} {row_i['sstart']}-{row_i['send']} {row_j['qseqid']} chr={row_j['sseqid']} {row_j['sstart']}-{row_j['send']} product_size = {dist} (*unintended* PCR product) Skipping listing other PCR products for this primer\n")
                    return dict_primers
    return dict_primers