Normalize beagle datasets user Kantale
From PyPedia
Contents |
[edit] Documentation
Takes a list of beagle and marker files and applies the following checks:
- Checks if the SNPs are compatible. If the compatibility cannot be corrected by SNP inversion then it is discarded.
- Checks if SNP has null alleles, if so, SNP is removed from study data.
- Checks if two SNPs with same reference code (rs) are in the same position.
- Checks if two SNPs in the same position have the same reference code (rs).
- Checks if a SNP in the study has MAF < MAF_minimum, HWE < HWE_minimum and CR < CR_minimum if any of these criteria are met, the SNP is discarded. (MAF = Minor Allele Frequency, HWE = Hardy Weinberg Equilibrium, CR = Call Rate)
It generates a log file with all inconsistencies found: At the end of this file there is a summary of the problems found:
- SNPs inverted: For Example A/G SNPs in reference , T/C SNPs in study
- Allele problems: Number of SNPs with inconsistent alleles in study and in reference that could not be fixed with flipping
- Position problems (different references, same loci): As it says. These SNPs are NOT removed. We keep the reference (rs number) of the reference panel
- Unresolved single alleles problems: SNPs in study that have only one allele. These SNPs are filtered out.
- Double rs codes problems: As it says. This SNPs are filtered out.
- SNPs in study with MAF < MAF_minimum: SNPs with MAF < MAF_minimum set.
- SNPs in study with HWE < HWE_minimum: SNPs with HWE < HWE_minimum set.
- SNPs in study with CR < CR_minimum: SNPs with Call Rate < CR_minimum set
- SNPs that differ in Allele Frequencies: SNPs with difference in AF between reference and study over CR_minimum set.
[edit] Parameters
<inputs> </inputs>
[edit] Return
[edit] See also
[edit] Code
import sys def Normalize_beagle_datasets_user_Kantale( input_beagle_filenames = None, input_markers_filenames = None, output_beagle_filenames = None, output_markers_filenames = None, output_markers_filename = None, missing = "0", MAF_minimum = 0.01, HWE_minimum = 0.0001, CR_minimum = 0.95, AF_difference = 0.25, input_types = None, #A list indicating whether input_beagle_filenames are "study" or "reference" log_filename = None, ): invert_dict = {"A":"T", "C":"G", "G":"C", "T":"A"} datasets = len(input_beagle_filenames) #Dictionary to store rs codes rs_codes = {} #Open all markers input_markers_files = [open(x) for x in input_markers_filenames] #open all markers input_beagle_files = [open(x) for x in input_beagle_filenames] #Open output files output_beagle_files = [open(x, "w") for x in output_beagle_filenames] output_markers_files = [open(x, "w") for x in output_markers_filenames] if log_filename: log = open(log_filename, "w") else: log = sys.stdout finished_files = [False] * datasets number_of_samples = [0] * datasets current_markers_lines_s = [x.readline().replace("\n", "").split() for x in input_markers_files] current_beagle_lines_s = [None] * len(input_beagle_filenames) for index in xrange(datasets): while True: line = input_beagle_files[index].readline() line_s = line.replace("\n", "").split() if line_s[0] == "M": current_beagle_lines_s[index] = list(line_s) number_of_samples[index] = len(line_s) - 2 break else: output_beagle_files[index].write(line) lower_locus = min([int(current_markers_lines_s[x][1]) for x in range(datasets)]) line_counter = 0 problems_alleles = 0 problems_positions = 0 problems_single_allele = 0 problems_double_rs = 0 problems_too_low_maf = 0 problems_too_low_hwe = 0 problems_too_low_cr = 0 problems_differ_afs = 0 problems_SNP_not_in_reference = 0 SNPs_inverted = 0 while True: line_counter += 1 MAFs = [] #Minor Allele Frequencies HWEs = [] #Hardy Weinberg Equilibriums CRs = [] #Call Rates if line_counter % 10000 == 0: print "Lines:", line_counter, " Current locus:", lower_locus, Print_now_user_Kantale() #Which do they have minimum locus? lower_locus_list = [x for x in range(datasets) if int(current_markers_lines_s[x][1]) == lower_locus] #print lower_locus_list, lower_locus #Take the first as reference: reference = current_markers_lines_s[lower_locus_list[0]][2:4] reference_rs = current_markers_lines_s[lower_locus_list[0]][0] #Take the references of the rest references = [current_markers_lines_s[x][2:4] for x in lower_locus_list] references_rs = [current_markers_lines_s[x][0] for x in lower_locus_list] #print "lower_locus:", lower_locus #print "lower_locus_list:", lower_locus_list lines_beagle = {} lines_markers = {} problem = False study_problem = False found_SNP_in_reference = False for x in range(datasets): MAFs += [None] HWEs += [None] CRs += [None] if x in lower_locus_list: x_index = lower_locus_list.index(x) #If this SNP is in reference we make a note if input_types[x] == "reference": found_SNP_in_reference = True #Detect single allele if references[x_index][0] == references[x_index][1]: if input_types[x] == "study": problem = True problems_single_allele += 1 log.write("File: " + input_beagle_filenames[x] + " Reference:" + reference_rs + " Pos:" + str(lower_locus) + " Single allele: " + references[x_index][0] + "\n") #The lines to be saved line_beagle = ["M", reference_rs] line_markers = [reference_rs, str(lower_locus)] + reference if reference_rs != references_rs[x_index]: #This is just a warning. Finally we save the rs of the reference study for all files. log.write("Warning, Different references in same loci. Loci: " + str(lower_locus) + " File1: " + input_beagle_filenames[lower_locus_list[0]] + " reference:" + reference_rs + " File2:" + input_beagle_filenames[x] + " reference:" + references_rs[x_index] + "\n") problems_positions += 1 if reference[0] == references[x_index][0] and reference[1] == references[x_index][1]: line_beagle += current_beagle_lines_s[x][2:] elif reference[0] == references[x_index][1] and reference[1] == references[x_index][0]: line_beagle += current_beagle_lines_s[x][2:] elif reference[0] == invert_dict[references[x_index][0]] and reference[1] == invert_dict[references[x_index][1]]: SNPs_inverted += 1 for y in current_beagle_lines_s[x][2:]: if y == missing: line_beagle += [missing] else: line_beagle += [invert_dict[y]] elif reference[0] == invert_dict[references[x_index][1]] and reference[1] == invert_dict[references[x_index][0]]: SNPs_inverted += 1 for y in current_beagle_lines_s[x][2:]: if y == missing: line_beagle += [missing] else: line_beagle += [invert_dict[y]] else: log.write( "Pos: " + str(lower_locus) + " Reference: " + reference_rs + "\n") log.write( "Filename: " + input_beagle_filenames[lower_locus_list[0]] + " Alleles:" + reference[0] + reference[1] + "\n") log.write( "Filename: " + input_beagle_filenames[x] + " Alleles:" + references[x_index][0] + references[x_index][1] + "\n") problems_alleles += 1 problem = True #Estimate Minor Allele Frequency, Hardy Weinberg Equilibrium and Call Rate if not problem: observations_chr1 = [y for index, y in enumerate(current_beagle_lines_s[x][2:]) if index % 2 == 0] observations_chr2 = [y for index, y in enumerate(current_beagle_lines_s[x][2:]) if index % 2 == 1] allele1 = current_markers_lines_s[x][2] allele2 = current_markers_lines_s[x][3] observations = zip(observations_chr1, observations_chr2) MAFs[x] = Minor_allele_frequency_user_Kantale( allele1 = allele1, allele2 = allele2, observations_chr1 = observations_chr1, observations_chr2 = observations_chr2, ) HWEs[x] = Hardy_Weinberg_Equilibrium_exact_test_user_Kantale( obs_hets = len([y for y in observations if (y[0] == allele1 and y[1] == allele2) or (y[0] == allele2 and y[1] == allele1)]), obs_hom1 = len([y for y in observations if y[0] == allele1 and y[1] == allele1]), obs_hom2 = len([y for y in observations if y[0] == allele2 and y[1] == allele2]), ) CRs[x] = float(len(observations_chr1) + len(observations_chr2)) / float(len([y for y in observations_chr1 + observations_chr2 if y != missing])) if MAFs[x] < MAF_minimum and input_types[x] == "study": log.write( "Study: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + " with MAF: " + str(MAFs[x]) + " < " + str(MAF_minimum)) study_problem = True problems_too_low_maf += 1 elif HWEs[x] < HWE_minimum and input_types[x] == "study": log.write( "Study: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + " with HWE: " + str(HWEs[x]) + " < " + str(HWE_minimum)) study_problem = True problems_too_low_hwe += 1 elif CRs[x] < CR_minimum and input_types[x] == "study": log.write( "Study: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + " with CR: " + str(HWEs[x]) + " < " + str(CR_minimum)) study_problem = True problems_too_low_cr += 1 #Have we seen the reference before? if rs_codes.has_key(references_rs[x_index]): log.write( "Filename: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + "\n") log.write( "This rs code has been met before in POS:" + str(rs_codes[references_rs[x_index]]) + "\n") problem = True problems_double_rs += 1 lines_beagle[x] = list(line_beagle) lines_markers[x] = list(line_markers) #Check difference in Allele Frequencies for study_maf in [MAFs[x] for x in range(datasets) if input_types[x] == "study" and MAFs[x]]: for reference_maf in [MAFs[y] for y in range(datasets) if input_types[y] == "reference" and MAFs[y]]: if abs(study_maf[0] - reference_maf[0]) > AF_difference: log.write( "Filename: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + "\n") log.write( "This SNP's Allele Frequency differes with Filename's: " + input_beagle_filenames[y] + " AF more than " + str(AF_difference) + "\n") problem = True problems_differ_afs += 1 #Check if the SNP was found in the reference: if not found_SNP_in_reference: log.write("Filename: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + "\n") log.write("This SNP does not exist in the reference\n") problem = True problems_SNP_not_in_reference += 1 #Print lines for x in range(datasets): if x in lower_locus_list: #Is this SNP filtered from the study? if study_problem and input_types[x] == "study": pass #If this SNPS is problematic remove from both reference and study elif not problem: rs_codes[reference_rs] = lower_locus output_beagle_files[x].write(str.join(" ", lines_beagle[x]) + "\n") output_markers_files[x].write(str.join(" ", lines_markers[x]) + "\n") #For these in lower_locus find next line for x in lower_locus_list: if not finished_files[x]: new_line = input_markers_files[x].readline() if not new_line: finished_files[x] = True log.write( "Finished file:" + input_beagle_filenames[x] + "\n") else: current_markers_lines_s[x] = new_line.replace("\n", "").split() current_beagle_lines_s[x] = input_beagle_files[x].readline().replace("\n", "").split() current_loci = [int(current_markers_lines_s[x][1]) for x in range(datasets) if not finished_files[x] ] if not current_loci: break lower_locus = min(current_loci) [x.close() for x in input_beagle_files] [x.close() for x in input_markers_files] [x.close() for x in output_beagle_files] [x.close() for x in output_markers_files] # output_markers_file.close() log.write( "SNPs inverted: " + str(SNPs_inverted) + "\n") log.write( "Allele problems: " + str(problems_alleles) + "\n") log.write( "Position problems (different references, same loci): " + str(problems_positions) + "\n") log.write( "Unresolved single alleles problems: " + str(problems_single_allele) + "\n") log.write( "Double rs codes problems: " + str(problems_double_rs) + "\n") log.write( "SNPs in study with MAF < MAF_minimum: " + str(problems_too_low_maf) + "\n") log.write( "SNPs in study with HWE < HWE_minimum: " + str(problems_too_low_hwe) + "\n") log.write( "SNPs in study with CR < CR_minimum: " + str(problems_too_low_cr) + "\n") log.write( "SNPs that differ in Allele Frequencies: " + str(problems_differ_afs) + "\n") log.write( "SNPs that are not present in reference: " + str(problems_SNP_not_in_reference) + "\n") if log_filename: log.close()
[edit] Unit Tests
def uni1(): return True
[edit] Development Code
def Normalize_beagle_datasets_user_Kantale(): pass
[edit] Permissions
[edit] Documentation Permissions
Kantale
[edit] Code Permissions
Kantale
[edit] Unit Tests Permissions
Kantale