Normalize beagle datasets user Kantale

From PyPedia
Jump to: navigation, search

Contents

[edit] Documentation

Takes a list of beagle and marker files and applies the following checks:

  • Checks if the SNPs are compatible. If the compatibility cannot be corrected by SNP inversion then it is discarded.
  • Checks if SNP has null alleles, if so, SNP is removed from study data.
  • Checks if two SNPs with same reference code (rs) are in the same position.
  • Checks if two SNPs in the same position have the same reference code (rs).
  • Checks if a SNP in the study has MAF < MAF_minimum, HWE < HWE_minimum and CR < CR_minimum if any of these criteria are met, the SNP is discarded. (MAF = Minor Allele Frequency, HWE = Hardy Weinberg Equilibrium, CR = Call Rate)

It generates a log file with all inconsistencies found: At the end of this file there is a summary of the problems found:

  • SNPs inverted: For Example A/G SNPs in reference , T/C SNPs in study
  • Allele problems: Number of SNPs with inconsistent alleles in study and in reference that could not be fixed with flipping
  • Position problems (different references, same loci): As it says. These SNPs are NOT removed. We keep the reference (rs number) of the reference panel
  • Unresolved single alleles problems: SNPs in study that have only one allele. These SNPs are filtered out.
  • Double rs codes problems: As it says. This SNPs are filtered out.
  • SNPs in study with MAF < MAF_minimum: SNPs with MAF < MAF_minimum set.
  • SNPs in study with HWE < HWE_minimum: SNPs with HWE < HWE_minimum set.
  • SNPs in study with CR < CR_minimum: SNPs with Call Rate < CR_minimum set
  • SNPs that differ in Allele Frequencies: SNPs with difference in AF between reference and study over CR_minimum set.


[edit] Parameters

<inputs>
</inputs>


[edit] Return

[edit] See also

[edit] Code

import sys

def Normalize_beagle_datasets_user_Kantale(
	input_beagle_filenames = None,
	input_markers_filenames = None,
	output_beagle_filenames = None,
	output_markers_filenames = None,
	output_markers_filename = None,
	missing = "0",
	MAF_minimum = 0.01,
	HWE_minimum =  0.0001,
	CR_minimum = 0.95,
	AF_difference = 0.25,
	input_types = None, #A list indicating whether input_beagle_filenames are "study" or "reference"
	log_filename = None,
):

	invert_dict = {"A":"T", "C":"G", "G":"C", "T":"A"}

	datasets = len(input_beagle_filenames)

	#Dictionary to store rs codes
	rs_codes = {}

	#Open all markers
	input_markers_files = [open(x) for x in input_markers_filenames]
	#open all markers
	input_beagle_files = [open(x) for x in input_beagle_filenames]

	#Open output files
	output_beagle_files = [open(x, "w") for x in output_beagle_filenames]
	output_markers_files = [open(x, "w") for x in output_markers_filenames]

	if log_filename:
		log = open(log_filename, "w")
	else:
		log = sys.stdout

	finished_files = [False] * datasets
	number_of_samples = [0] * datasets

	current_markers_lines_s = [x.readline().replace("\n", "").split() for x in input_markers_files]

	current_beagle_lines_s = [None] * len(input_beagle_filenames)
	for index in xrange(datasets):
		while True:
			line = input_beagle_files[index].readline()
			line_s = line.replace("\n", "").split()
			if line_s[0] == "M":
				current_beagle_lines_s[index] = list(line_s)
				number_of_samples[index] = len(line_s) - 2
				break
			else:
				output_beagle_files[index].write(line)

	lower_locus = min([int(current_markers_lines_s[x][1]) for x in range(datasets)])
	line_counter = 0
	problems_alleles = 0
	problems_positions = 0
	problems_single_allele = 0
	problems_double_rs = 0
	problems_too_low_maf = 0
	problems_too_low_hwe = 0
	problems_too_low_cr = 0
	problems_differ_afs = 0
	problems_SNP_not_in_reference = 0
	SNPs_inverted = 0
	while True:

		line_counter += 1

		MAFs = []	#Minor Allele Frequencies
		HWEs = []	#Hardy Weinberg Equilibriums
		CRs = []	#Call Rates

		if line_counter % 10000 == 0: 
			print "Lines:", line_counter, "  Current locus:", lower_locus, Print_now_user_Kantale()

		#Which do they have minimum locus?
		lower_locus_list = [x for x in range(datasets) if int(current_markers_lines_s[x][1]) == lower_locus]
		
		#print lower_locus_list, lower_locus

		#Take the first as reference:
		reference =    current_markers_lines_s[lower_locus_list[0]][2:4]
		reference_rs = current_markers_lines_s[lower_locus_list[0]][0]

		#Take the references of the rest
		references    = [current_markers_lines_s[x][2:4] for x in lower_locus_list]
		references_rs = [current_markers_lines_s[x][0] for x in lower_locus_list]

		#print "lower_locus:", lower_locus
		#print "lower_locus_list:", lower_locus_list

		lines_beagle = {}
		lines_markers = {}

		problem = False
		study_problem = False
		found_SNP_in_reference = False
		for x in range(datasets):
			
			MAFs += [None]
			HWEs += [None]
			CRs += [None]

			if x in lower_locus_list:

				x_index = lower_locus_list.index(x)

				#If this SNP is in reference we make a note
				if input_types[x] == "reference":
					found_SNP_in_reference = True

				#Detect single allele
				if references[x_index][0] == references[x_index][1]:
					if input_types[x] == "study":
						problem = True
						problems_single_allele += 1
						log.write("File: " + input_beagle_filenames[x] + "  Reference:" + reference_rs + "   Pos:" + str(lower_locus) + "  Single allele: " + references[x_index][0] + "\n")

				#The lines to be saved
				line_beagle = ["M", reference_rs]
				line_markers = [reference_rs, str(lower_locus)] + reference		


				if reference_rs != references_rs[x_index]:
					#This is just a warning. Finally we save the rs of the reference study for all files.
					log.write("Warning, Different references in same loci. Loci: " + str(lower_locus) + " File1: " + input_beagle_filenames[lower_locus_list[0]] + " reference:" + reference_rs + "  File2:" + input_beagle_filenames[x] + " reference:" + references_rs[x_index] + "\n")
					problems_positions += 1

				if reference[0] == references[x_index][0] and reference[1] == references[x_index][1]:
					line_beagle += current_beagle_lines_s[x][2:]
				elif reference[0] == references[x_index][1] and reference[1] == references[x_index][0]:
					line_beagle += current_beagle_lines_s[x][2:]
				elif reference[0] == invert_dict[references[x_index][0]] and reference[1] == invert_dict[references[x_index][1]]:
					SNPs_inverted += 1
					for y in current_beagle_lines_s[x][2:]:
						if y == missing: line_beagle += [missing]
						else: line_beagle += [invert_dict[y]]
				elif reference[0] == invert_dict[references[x_index][1]] and reference[1] == invert_dict[references[x_index][0]]:
					SNPs_inverted += 1
					for y in current_beagle_lines_s[x][2:]:
						if y == missing: line_beagle += [missing]
						else: line_beagle += [invert_dict[y]]
				else:
					log.write( "Pos: " + str(lower_locus) + "  Reference: " + reference_rs + "\n")
					log.write( "Filename: " + input_beagle_filenames[lower_locus_list[0]] + "  Alleles:" + reference[0] + reference[1] + "\n")
					log.write( "Filename: " + input_beagle_filenames[x] + "  Alleles:" + references[x_index][0] + references[x_index][1] + "\n")
					problems_alleles += 1
					problem = True
				
				#Estimate Minor Allele Frequency, Hardy Weinberg Equilibrium and Call Rate
				if not problem:
					observations_chr1 = [y for index, y in enumerate(current_beagle_lines_s[x][2:]) if index % 2 == 0]
					observations_chr2 = [y for index, y in enumerate(current_beagle_lines_s[x][2:]) if index % 2 == 1]
					allele1 = current_markers_lines_s[x][2]
					allele2 = current_markers_lines_s[x][3]
					observations = zip(observations_chr1, observations_chr2)

					MAFs[x] = Minor_allele_frequency_user_Kantale(
							allele1 = allele1,
							allele2 = allele2,
							observations_chr1 = observations_chr1,
							observations_chr2 = observations_chr2,
						)
					HWEs[x] = Hardy_Weinberg_Equilibrium_exact_test_user_Kantale(
							obs_hets = len([y for y in observations if (y[0] == allele1 and y[1] == allele2) or (y[0] == allele2 and y[1] == allele1)]),
							obs_hom1 = len([y for y in observations if y[0] == allele1 and y[1] == allele1]),
							obs_hom2 = len([y for y in observations if y[0] == allele2 and y[1] == allele2]),
						)
					CRs[x] = float(len(observations_chr1) + len(observations_chr2)) / float(len([y for y in observations_chr1 + observations_chr2 if y != missing]))
						
					if MAFs[x] < MAF_minimum and input_types[x] == "study":
						log.write( "Study: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + " with MAF: " + str(MAFs[x]) + " < " + str(MAF_minimum))
						study_problem = True
						problems_too_low_maf += 1
					elif HWEs[x] < HWE_minimum and input_types[x] == "study":
						log.write( "Study: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + " with HWE: " + str(HWEs[x]) + " < " + str(HWE_minimum))
						study_problem = True
						problems_too_low_hwe += 1
					elif CRs[x] < CR_minimum and input_types[x] == "study":
						log.write( "Study: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + " with CR: " + str(HWEs[x]) + " < " + str(CR_minimum))
						study_problem = True
						problems_too_low_cr += 1

				#Have we seen the reference before?
				if rs_codes.has_key(references_rs[x_index]):
					log.write( "Filename: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + "\n")
					log.write( "This rs code has been met before in POS:" + str(rs_codes[references_rs[x_index]]) + "\n")
					problem = True
					problems_double_rs += 1

				lines_beagle[x] = list(line_beagle)
				lines_markers[x] = list(line_markers)

		#Check difference in Allele Frequencies
		for study_maf in [MAFs[x] for x in range(datasets) if input_types[x] == "study" and MAFs[x]]:
			for reference_maf in [MAFs[y] for y in range(datasets) if input_types[y] == "reference" and MAFs[y]]:
				if abs(study_maf[0] - reference_maf[0]) > AF_difference:
					log.write( "Filename: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + "\n")
					log.write( "This SNP's Allele Frequency differes with Filename's: " + input_beagle_filenames[y] + " AF more than " + str(AF_difference) + "\n")
					problem = True
					problems_differ_afs += 1

		#Check if the SNP was found in the reference:
		if not found_SNP_in_reference:
			log.write("Filename: " + input_beagle_filenames[x] + " has the SNP:" + references_rs[x_index] + " in position: " + str(lower_locus) + "\n")
			log.write("This SNP does not exist in the reference\n")
			problem = True
			problems_SNP_not_in_reference += 1

		#Print lines
		for x in range(datasets):
			if x in lower_locus_list:
				
				#Is this SNP filtered from the study?
				if study_problem and input_types[x] == "study":
					pass
					
				#If this SNPS is problematic remove from both reference and study
				elif not problem:
					rs_codes[reference_rs] = lower_locus
					output_beagle_files[x].write(str.join(" ", lines_beagle[x]) + "\n")
					output_markers_files[x].write(str.join(" ", lines_markers[x]) + "\n")
					

		#For these in lower_locus find next line
		for x in lower_locus_list:
			if not finished_files[x]:
				new_line = input_markers_files[x].readline()
				if not new_line:
					finished_files[x] = True
					log.write( "Finished file:" + input_beagle_filenames[x] + "\n")
				else:
					current_markers_lines_s[x] = new_line.replace("\n", "").split()
					current_beagle_lines_s[x]  = input_beagle_files[x].readline().replace("\n", "").split()

		current_loci = [int(current_markers_lines_s[x][1]) for x in range(datasets) if not finished_files[x] ]
		if not current_loci:
			break
			
		lower_locus = min(current_loci)

	[x.close() for x in input_beagle_files]
	[x.close() for x in input_markers_files]
	[x.close() for x in output_beagle_files]
	[x.close() for x in output_markers_files]
#	output_markers_file.close()

	log.write( "SNPs inverted: " + str(SNPs_inverted) + "\n")
	log.write( "Allele problems: " + str(problems_alleles) + "\n")
	log.write( "Position problems (different references, same loci): " + str(problems_positions) + "\n")
	log.write( "Unresolved single alleles problems: " + str(problems_single_allele) + "\n")
	log.write( "Double rs codes problems: " + str(problems_double_rs) + "\n")
	log.write( "SNPs in study with MAF < MAF_minimum: " + str(problems_too_low_maf) + "\n")
	log.write( "SNPs in study with HWE < HWE_minimum: " + str(problems_too_low_hwe) + "\n")
	log.write( "SNPs in study with CR < CR_minimum: " + str(problems_too_low_cr) + "\n")
	log.write( "SNPs that differ in Allele Frequencies: " + str(problems_differ_afs) + "\n")
	log.write( "SNPs that are not present in reference: " + str(problems_SNP_not_in_reference) + "\n")

	if log_filename:
		log.close()

[edit] Unit Tests

def uni1():
	return True

[edit] Development Code

def Normalize_beagle_datasets_user_Kantale():
	pass

[edit] Permissions

[edit] Documentation Permissions

Kantale

[edit] Code Permissions

Kantale

[edit] Unit Tests Permissions

Kantale

[edit] Permissions Permissions

Kantale

Personal tools
Namespaces

Variants
Actions
Navigation
Toolbox