Source code for tmtk.toolbox.remap_chromosomal_regions

import pandas as pd


def remap_chromosomal_regions(origin_platform=None, destination_platform=None, datafile=None,
                              flag_indicator='.flag', to_dest=2, start_dest=3, end_dest=4,
                              region_dest=1, chr_origin=2, start_origin=3, end_origin=4,
                              region_origin=1, region_data=0):
    """
    Remap region-level data from an origin platform onto the regions of a
    destination platform.

    For every destination region the overlapping origin regions are looked up
    (same chromosome, overlapping start/end interval). The data rows belonging
    to those origin regions are then combined with ``return_mean``. Destination
    regions without any overlapping origin region are dropped from the result.

    All column arguments are integer column positions (used with ``iloc``).

    :param origin_platform: dataframe describing the regions the data is currently mapped to.
    :param destination_platform: dataframe describing the regions to map the data onto.
    :param datafile: dataframe with the data to remap.
    :param flag_indicator: pattern used to recognise flag columns in datafile.
    :param to_dest: position of the chromosome column in destination_platform.
    :param start_dest: position of the start column in destination_platform.
    :param end_dest: position of the end column in destination_platform.
    :param region_dest: position of the region id column in destination_platform.
    :param chr_origin: position of the chromosome column in origin_platform.
    :param start_origin: position of the start column in origin_platform.
    :param end_origin: position of the end column in origin_platform.
    :param region_origin: position of the region id column in origin_platform.
    :param region_data: position of the region id column in datafile.
        NOTE(review): ``return_mean`` always matches region ids against the first
        column of datafile, so values other than 0 are not fully supported.
    :return: dataframe with the columns of datafile, one row per mapped destination region.
    """
    dest_regions = _convert_xy_to_int(
        destination_platform.iloc[:, [to_dest, start_dest, end_dest]])
    orig_regions = _convert_xy_to_int(
        origin_platform.iloc[:, [chr_origin, start_origin, end_origin]])

    # For each destination region: index of the overlapping origin regions (or None).
    overlap = _map_multiple_segments_to_gene(dest_regions, orig_regions)

    segments_region_column = datafile.columns[region_data]
    flag_columns = _find_flag_columns(datafile, flag_indicator)

    # Remove any destination regions without mapping.
    only_scores = overlap[~overlap.isnull()]

    # Get the origin region ids corresponding to the mapping.
    region_id_mapping = only_scores.apply(
        lambda x: map_index_to_region_ids(x, origin_platform, region_origin))

    # Combine the data rows of the mapped origin regions.
    remapped_regions = region_id_mapping.apply(
        lambda x: return_mean(datafile, x, flag_columns))

    # Create a new data structure with the same columns as the input data.
    new_df = pd.DataFrame(columns=datafile.columns, data=remapped_regions)

    # Add back the region ids. These come from the destination platform, so its
    # own region id column (region_dest) has to be used here.
    new_df[segments_region_column] = destination_platform.iloc[:, region_dest]

    # Convert flag columns to int. Checking the length (instead of the
    # truthiness of the column names) also works for falsy column labels.
    if len(flag_columns) > 0:
        new_df[flag_columns] = new_df[flag_columns].apply(lambda col: col.map(int))

    return new_df
def _find_flag_columns(datafile, flag_indicator):
    """
    Find the columns of datafile whose name matches flag_indicator.

    :param datafile: dataframe whose column names are searched.
    :param flag_indicator: pattern to look for flag column. It is interpreted
        as a regular expression by ``str.contains``, so the default '.flag'
        matches any character followed by 'flag'.
        NOTE(review): confirm whether a literal '.' was intended.
    :return: Index with the matching column names, or an empty Series when
        no column matches.
    """
    # na=False: column labels that are not strings simply do not match instead
    # of putting NaN into the boolean mask.
    col_names_contain_flag = datafile.columns.str.contains(flag_indicator, na=False)

    if col_names_contain_flag.any():
        return datafile.columns[col_names_contain_flag]

    # Explicit dtype avoids the default-dtype warning for empty Series.
    return pd.Series([], dtype=object)


# Integer codes used for the sex chromosomes.
_SEX_CHROMOSOME_CODES = {'X': 23, 'Y': 24}


def _convert_xy_to_int(df):
    """
    Return a copy of df with every value converted to int, where the
    chromosome names 'X' and 'Y' become 23 and 24 respectively.

    :param df: dataframe with chromosome, start and end values.
    :return: new dataframe with integer values, df itself is not modified.
    """
    def _to_int(value):
        if isinstance(value, str) and value in _SEX_CHROMOSOME_CODES:
            return _SEX_CHROMOSOME_CODES[value]
        return int(value)

    # Column-wise Series.map is the element-wise conversion that works on all
    # pandas versions (DataFrame.applymap is deprecated).
    return df.apply(lambda col: col.map(_to_int))


def _find_overlapping_segments(chrom, start, end, regions):
    """
    Find the regions on chromosome chrom that overlap the interval [start, end].

    :param chrom: chromosome to look at.
    :param start: start of the interval.
    :param end: end of the interval.
    :param regions: dataframe with chromosome, start and end as first three columns.
    :return: list with index labels of the overlapping regions, None if there are none.
    """
    same_chromosome = regions.iloc[:, 0] == chrom
    # Two intervals overlap when each one starts before the other one ends.
    overlaps = (regions.iloc[:, 2] >= start) & (regions.iloc[:, 1] <= end)

    selected_segments_index = regions.loc[same_chromosome & overlaps].index
    if len(selected_segments_index) > 0:
        return list(selected_segments_index)
    return None


def _map_multiple_segments_to_gene(from_regions, to_regions):
    """
    For each row in from_regions find the overlapping rows in to_regions.

    :param from_regions: dataframe with chromosome, start and end as first three columns.
    :param to_regions: dataframe with chromosome, start and end as first three columns.
    :return: result of the row-wise apply, holding for every row of from_regions
        the list returned by _find_overlapping_segments, or None.
    """
    return from_regions.apply(
        lambda row: _find_overlapping_segments(row.iloc[0], row.iloc[1], row.iloc[2], to_regions),
        axis=1)
def map_index_to_region_ids(gene, origin_platform, region_origin):
    """
    Translate row positions of origin_platform into their region ids.

    :param gene: iterable with row positions in origin_platform (used with ``iloc``).
    :param origin_platform: dataframe that holds the region ids.
    :param region_origin: position of the region id column in origin_platform.
    :return: list with one region id per entry in gene, in the same order.
    """
    return [origin_platform.iloc[row, region_origin] for row in gene]
def return_mean(datafile, mapping, flag_columns=None):
    """
    Combine the rows of datafile that belong to the regions in mapping.

    Rows are selected by matching the first column of datafile against mapping.
    All remaining columns are converted to float and averaged. If flag columns
    are given and more than one region is mapped, the value of each flag column
    is replaced by its most frequently occurring value among the selected rows.

    :param datafile: dataframe with region ids in the first column.
    :param mapping: collection of region ids to combine.
    :param flag_columns: column names of the flag columns, None or empty if
        there are none.
    :return: Series with one value for every column except the first.
    """
    mapped_regions = datafile[datafile.iloc[:, 0].isin(mapping)]

    # Column-wise Series.map is the element-wise conversion that works on all
    # pandas versions (DataFrame.applymap is deprecated).
    mean_values = mapped_regions.iloc[:, 1:].apply(lambda col: col.map(float)).mean()

    # The default (None) means there are no flag columns to handle.
    has_flag_columns = flag_columns is not None and len(flag_columns) > 0

    if has_flag_columns and len(mapping) > 1:
        # A mean is not meaningful for flags, use the most common value instead.
        mean_values[flag_columns] = mapped_regions[flag_columns].apply(
            lambda col: col.value_counts().index[0])

    return mean_values