add new little requested features #875

Merged: 9 commits, Apr 24, 2024
60 changes: 41 additions & 19 deletions hicexplorer/chicAggregateStatistic.py
@@ -63,20 +63,31 @@ def parse_arguments(args=None):
return parser


def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetIntervalTree=None, pTargetFile=None):
def filter_scores_target_list(pScoresDictionary, pTargetFType, pTargetPosDict, pTargetList=None, pTargetIntervalTree=None, pTargetFile=None):

accepted_scores = {}
same_target_dict = {}
target_regions_intervaltree = None
if pTargetList is not None:

# newly added
if pTargetFType == 'hdf5':
# read hdf content for this specific combination
targetFileHDF5Object = h5py.File(pTargetFile, 'r')
target_object = targetFileHDF5Object['/'.join(pTargetList)]
chromosome = target_object.get('chromosome')[()].decode("utf-8")
start_list = list(target_object['start_list'][:])
end_list = list(target_object['end_list'][:])
targetFileHDF5Object.close()
elif pTargetFType == 'bed4':
chromosome = pTargetPosDict[pTargetList[-1]]['chromosome']
start_list = pTargetPosDict[pTargetList[-1]]['start_list']
end_list = pTargetPosDict[pTargetList[-1]]['end_list']
elif pTargetFType == 'bed3':
target_regions_intervaltree = pTargetIntervalTree
else:
log.error('No target list given.')
raise Exception('No target list given.')

if pTargetList is not None:
chromosome = [chromosome] * len(start_list)

target_regions = list(zip(chromosome, start_list, end_list))
@@ -85,12 +96,6 @@ def filter_scores_target_list(pScoresDictionary, pTargetList=None, pTargetInterv

hicmatrix = hm.hiCMatrix()
target_regions_intervaltree = hicmatrix.intervalListToIntervalTree(target_regions)[0]
elif pTargetIntervalTree is not None:
target_regions_intervaltree = pTargetIntervalTree

else:
log.error('No target list given.')
raise Exception('No target list given.')

for key in pScoresDictionary:
chromosome = pScoresDictionary[key][0]
@@ -193,12 +198,12 @@ def writeAggregateHDF(pOutFileName, pOutfileNamesList, pAcceptedScoresList, pArg
aggregateFileH5Object.close()


def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pViewpointObj, pQueue=None, pOneTarget=False):
def run_target_list_compilation(pInteractionFilesList, pTargetList, pTargetFType, pTargetPosDict, pArgs, pViewpointObj, pQueue=None, pOneTarget=False):
outfile_names_list = []
accepted_scores_list = []
target_regions_intervaltree = None
try:
if pOneTarget == True:
if pTargetFType == 'bed3':
try:
target_regions = utilities.readBed(pTargetList)
except Exception as exp:
@@ -211,14 +216,13 @@ def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pView
outfile_names_list_intern = []
accepted_scores_list_intern = []
for sample in interactionFile:

interaction_data, interaction_file_data, _ = pViewpointObj.readInteractionFile(pArgs.interactionFile, sample)
if pOneTarget == True:
target_file = None
else:
target_file = pTargetList[i]

accepted_scores = filter_scores_target_list(interaction_file_data, pTargetList=target_file, pTargetIntervalTree=target_regions_intervaltree, pTargetFile=pArgs.targetFile)
accepted_scores = filter_scores_target_list(interaction_file_data, pTargetFType, pTargetPosDict, pTargetList=target_file, pTargetIntervalTree=target_regions_intervaltree, pTargetFile=pArgs.targetFile)

outfile_names_list_intern.append(sample)
accepted_scores_list_intern.append(accepted_scores)
@@ -238,7 +242,7 @@ def run_target_list_compilation(pInteractionFilesList, pTargetList, pArgs, pView
return


def call_multi_core(pInteractionFilesList, pTargetFileList, pFunctionName, pArgs, pViewpointObj):
def call_multi_core(pInteractionFilesList, pTargetFileList, pTargetFType, pTargetPosDict, pFunctionName, pArgs, pViewpointObj):
if len(pInteractionFilesList) < pArgs.threads:
pArgs.threads = len(pInteractionFilesList)
outfile_names_list = [None] * pArgs.threads
@@ -272,6 +276,8 @@ def call_multi_core(pInteractionFilesList, pTargetFileList, pFunctionName, pArgs
process[i] = Process(target=pFunctionName, kwargs=dict(
pInteractionFilesList=interactionFileListThread,
pTargetList=targetFileListThread,
pTargetFType=pTargetFType,
pTargetPosDict=pTargetPosDict,
pArgs=pArgs,
pViewpointObj=pViewpointObj,
pQueue=queue[i],
@@ -318,16 +324,32 @@ def main(args=None):

targetList = []
present_genes = {}
target_ftype = ''
targetPosDict = None
# read hdf file
interactionFileHDF5Object = h5py.File(args.interactionFile, 'r')
keys_interactionFile = list(interactionFileHDF5Object.keys())

if h5py.is_hdf5(args.targetFile):

targetDict, present_genes = viewpointObj.readTargetHDFFile(args.targetFile)
target_ftype = 'hdf5'

else:
targetList = [args.targetFile]
with open(args.targetFile) as file:
for line in file.readlines():
if line.startswith('#'):
continue
_line = line.strip().split('\t')
Review comment (Member):
I find the variable name _line confusing; at this point it holds the split elements of the line rather than the line itself.

Reply (Contributor Author):
To be consistent with the rest of the project I used _line; the same line of code can be found several times elsewhere.

break
if len(_line) == 4:
log.info('Targets BED contains 4 columns, aggregating on column 4')
target_ftype = 'bed4'
present_genes, targetDict, targetPosDict = utilities.readTargetBed(args.targetFile)
elif len(_line) == 3:
targetList = [args.targetFile]
target_ftype = 'bed3'
else:
log.error('BED of targets list must have 3 or 4 columns')

if len(keys_interactionFile) > 1:

@@ -346,7 +368,7 @@ def main(args=None):
geneList2 = sorted(list(matrix_obj2[chromosome2].keys()))

for gene1, gene2 in zip(geneList1, geneList2):
if h5py.is_hdf5(args.targetFile):
if target_ftype != 'bed3':
if gene1 in present_genes[sample][sample2]:
interactionDict[gene1] = [[sample, chromosome1, gene1], [sample2, chromosome2, gene2]]
else:
@@ -356,7 +378,7 @@ def main(args=None):

interactionFileHDF5Object.close()

if h5py.is_hdf5(args.targetFile):
if target_ftype != 'bed3':
key_outer_matrix = present_genes.keys()
for keys_outer in key_outer_matrix:
keys_inner_matrix = present_genes[keys_outer].keys()
@@ -365,5 +387,5 @@ def main(args=None):
interactionList.append(interactionDict[gene])
targetList.append(targetDict[gene])

outfile_names_list, accepted_scores_list = call_multi_core(interactionList, targetList, run_target_list_compilation, args, viewpointObj)
outfile_names_list, accepted_scores_list = call_multi_core(interactionList, targetList, target_ftype, targetPosDict, run_target_list_compilation, args, viewpointObj)
writeAggregateHDF(args.outFileName, outfile_names_list, accepted_scores_list, args)
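
For orientation: the new target handling first checks h5py.is_hdf5() on the target file and otherwise inspects the first non-comment line of the BED file, treating four columns as per-gene targets ('bed4') and three columns as a single shared target list ('bed3'). A minimal sketch of that detection follows, written under those assumptions; the helper name detect_target_ftype is illustrative and not part of HiCExplorer.

import h5py


def detect_target_ftype(target_file):
    # Mirror the dispatch added in main(): HDF5 wins, otherwise the BED
    # flavour is chosen from the column count of the first data line.
    if h5py.is_hdf5(target_file):
        return 'hdf5'
    with open(target_file) as fh:
        for line in fh:
            if line.startswith('#'):
                continue  # skip comment lines
            fields = line.strip().split('\t')
            if len(fields) == 4:
                return 'bed4'  # chrom, start, end, gene: aggregate per gene
            if len(fields) == 3:
                return 'bed3'  # chrom, start, end: one target list for all genes
            raise ValueError('BED of targets list must have 3 or 4 columns')
    raise ValueError('target file contains no data lines')
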
12 changes: 11 additions & 1 deletion hicexplorer/hicCorrectMatrix.py
@@ -235,6 +235,9 @@ def correct_subparser():
'of chromosomes and/or translocations.',
action='store_true')

parserOpt.add_argument('--filteredBed',
help='Print bins filtered out by --filterThreshold to this file')

parserOpt.add_argument('--verbose',
help='Print processing status.',
action='store_true')
@@ -548,6 +551,7 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False):
to avoid introducing bias due to different chromosome numbers

"""
log.info("filtering by z-score")
to_remove = []
if perchr:
for chrname in list(hic_ma.interval_trees):
@@ -575,6 +579,7 @@ def filter_by_zscore(hic_ma, lower_threshold, upper_threshold, perchr=False):
"\n".format(chrname, lower_threshold, upper_threshold))

to_remove.extend(problematic)

else:
row_sum = np.asarray(hic_ma.matrix.sum(axis=1)).flatten()
# subtract from row sum, the diagonal
@@ -584,7 +589,6 @@ def main(args=None):
mad = MAD(row_sum)
to_remove = np.flatnonzero(mad.is_outlier(
lower_threshold, upper_threshold))

return sorted(to_remove)


@@ -658,6 +662,12 @@ def main(args=None):
restore_masked_bins=False)

assert matrix_shape == ma.matrix.shape

if args.filteredBed:
with open(args.filteredBed, 'w') as f:
for outlier_region in set(outlier_regions):
interval = ma.cut_intervals[outlier_region]
f.write(f"{interval[0]}\t{interval[1]}\t{interval[2]}\t.\t{interval[3]}\t.\n")
# mask filtered regions
ma.maskBins(outlier_regions)
total_filtered_out = set(outlier_regions)
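
Based on the write statement above, each line of the --filteredBed output carries six tab-separated fields: chromosome, start, end, a '.' placeholder for the name, the fourth element of the matrix cut interval (the extra/coverage value), and a '.' placeholder for the strand. A small sketch for reading that file back, assuming exactly that layout; the function and file names are illustrative.

def read_filtered_bed(path):
    # Parse the six-column, BED-like file written by --filteredBed.
    # Columns 4 and 6 are '.' placeholders; column 5 holds the extra value
    # stored with each cut interval.
    filtered_bins = []
    with open(path) as fh:
        for line in fh:
            fields = line.rstrip('\n').split('\t')
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            extra = fields[4]
            filtered_bins.append((chrom, start, end, extra))
    return filtered_bins

# usage (path is illustrative): bins = read_filtered_bed('filtered_bins.bed')
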
76 changes: 58 additions & 18 deletions hicexplorer/test/general/test_chicAggregateStatistic.py
@@ -113,41 +113,81 @@ def test_regular_mode_threads():
aggregateFileH5Object.close()


def test_target_list():
def test_target_list_bed3():
outfile_aggregate = NamedTemporaryFile(suffix='.hdf5', delete=False)
outfile_aggregate.close()
args = "--interactionFile {} --targetFile {} --outFileName {} \
-t {}".format(ROOT + 'chicViewpoint/two_matrices.hdf5',
ROOT + 'chicAggregateStatistic/target_list.bed',
-t {}".format(ROOT + 'chicViewpoint/two_matrices_custom_keys.hdf5',
ROOT + 'chicAggregateStatistic/target_list_3col.bed',
outfile_aggregate.name, 10).split()
chicAggregateStatistic.main(args)

aggregateFileH5Object = h5py.File(outfile_aggregate.name, 'r')
assert 'FL-E13-5_chr1_MB-E10-5_chr1' in aggregateFileH5Object
assert 'FL-E13-5_chr1' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']
assert 'MB-E10-5_chr1' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']
assert 'c_adj_norm_t_adj_norm' in aggregateFileH5Object
assert 'c_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']
assert 't_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']

assert 'genes' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1']
assert 'genes' in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1']
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']
assert len(aggregateFileH5Object) == 1
assert aggregateFileH5Object.attrs['type'] == 'aggregate'

for chromosome in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1']:
for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']:

assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome]) == 3
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]) == 3

for gene in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome]:
assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome][gene]) == 7
for data in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['FL-E13-5_chr1'][chromosome][gene]:
for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]:
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]) == 7
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]:
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']

for chromosome in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1']:
for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']:

assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome]) == 3
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]) == 3

for gene in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome]:
assert len(aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome][gene]) == 7
for data in aggregateFileH5Object['FL-E13-5_chr1_MB-E10-5_chr1']['MB-E10-5_chr1'][chromosome][gene]:
for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]:
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]) == 7
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]:
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']

aggregateFileH5Object.close()


def test_target_list_bed4():
outfile_aggregate = NamedTemporaryFile(suffix='.hdf5', delete=False)
outfile_aggregate.close()
args = "--interactionFile {} --targetFile {} --outFileName {} \
-t {}".format(ROOT + 'chicViewpoint/two_matrices_custom_keys.hdf5',
ROOT + 'chicAggregateStatistic/target_list_4col.bed',
outfile_aggregate.name, 10).split()
chicAggregateStatistic.main(args)

aggregateFileH5Object = h5py.File(outfile_aggregate.name, 'r')
assert 'c_adj_norm_t_adj_norm' in aggregateFileH5Object
assert 'c_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']
assert 't_adj_norm' in aggregateFileH5Object['c_adj_norm_t_adj_norm']

assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']
assert 'genes' in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']
assert len(aggregateFileH5Object) == 1
assert aggregateFileH5Object.attrs['type'] == 'aggregate'

for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm']:

assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]) == 3

for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome]:
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]) == 7
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['c_adj_norm'][chromosome][gene]:
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']

for chromosome in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm']:

assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]) == 3

for gene in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome]:
assert len(aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]) == 7
for data in aggregateFileH5Object['c_adj_norm_t_adj_norm']['t_adj_norm'][chromosome][gene]:
assert data in ['chromosome', 'end_list', 'gene_name', 'raw_target_list', 'relative_distance_list', 'start_list', 'sum_of_interactions']

aggregateFileH5Object.close()
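
Both new tests assert the same aggregate HDF5 layout: one top-level group per sample combination, a subgroup per sample, then per-chromosome groups (plus a 'genes' group), and per-gene groups that each hold the seven datasets listed in the asserts. A short sketch of walking that hierarchy with h5py, assuming that layout; the file path is illustrative.

import h5py


def walk_aggregate_file(path):
    # Walk combination -> sample -> chromosome -> gene and print the dataset
    # names stored for every gene, following the layout asserted above.
    with h5py.File(path, 'r') as h5:
        for combination, comb_group in h5.items():           # e.g. 'c_adj_norm_t_adj_norm'
            for sample, sample_group in comb_group.items():  # e.g. 'c_adj_norm'
                for chromosome, chrom_group in sample_group.items():
                    if not isinstance(chrom_group, h5py.Group):
                        continue
                    for gene, gene_group in chrom_group.items():
                        print(combination, sample, chromosome, gene, list(gene_group))

# usage (path is illustrative): walk_aggregate_file('aggregate.hdf5')
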
18 changes: 16 additions & 2 deletions hicexplorer/test/general/test_hicCorrectMatrix.py
@@ -16,21 +16,35 @@
os.path.dirname(os.path.abspath(__file__))), "test_data/")


def are_files_equal(file1, file2, pDifference=1):
with open(file1) as textfile1, open(file2) as textfile2:
for x, y in zip(textfile1, textfile2):
if x != y:
count = sum(1 for a, b in zip(x, y) if a != b)
if count > pDifference:
return False
return True


def test_correct_matrix_ICE():
outfile = NamedTemporaryFile(suffix='.ICE.h5', delete=False)
outfile.close()

outfile_filtered = NamedTemporaryFile(suffix='.bed', delete=True)

args = "correct --matrix {} --correctionMethod ICE --chromosomes "\
"chrUextra chr3LHet --iterNum 500 --outFileName {} "\
"chrUextra chr3LHet --iterNum 500 --outFileName {} --filteredBed {} "\
"--filterThreshold -1.5 5.0".format(ROOT + "small_test_matrix.h5",
outfile.name).split()
outfile.name,
outfile_filtered.name).split()
# hicCorrectMatrix.main(args)
compute(hicCorrectMatrix.main, args, 5)
test = hm.hiCMatrix(
ROOT + "hicCorrectMatrix/small_test_matrix_ICEcorrected_chrUextra_chr3LHet.h5")
new = hm.hiCMatrix(outfile.name)
nt.assert_equal(test.matrix.data, new.matrix.data)
nt.assert_equal(test.cut_intervals, new.cut_intervals)
assert are_files_equal(outfile_filtered.name, ROOT + 'hicCorrectMatrix/filtered.bed')

os.unlink(outfile.name)

Binary file not shown.

This file was deleted.

@@ -0,0 +1,8 @@
chr1 4480000 4549000
chr1 4555000 4688000
chr1 14274000 14279000
chr1 14290000 14439000
chr1 14444000 14467000
chr1 14476000 14501000
chr1 19077000 19118000
chr1 19120000 19274000
@@ -0,0 +1,3 @@
chr1 4487335 4487535 Sox17
chr1 14300180 14300380 Eya1
chr1 19093003 19093203 Tfap2d
Binary file not shown.