Assembly impute2 gprobs bins user Kantale

From PyPedia
Jump to: navigation, search

Contents

[edit] Documentation

Assembly gprobs files from multiple imputation pipelines from several bins into one file. This method is designed to assembly gprobs files created by the Imputation_pipeline_user_Kantale.

[edit] Parameters

<inputs>
</inputs>

[edit] Return

[edit] See also

Imputation_pipeline_user_Kantale

[edit] Code

def Assembly_impute2_gprobs_bins_user_Kantale(
	bins_filenames = None,
	bin_sizes = None, #Array of size of each bin
	tfam_filename = None,
	gprobs_filenames = None,
	impute2_info_filenames = None,
	output_filename = None,
	output_info_filename = None,
):
	#Read tfam
	if tfam_filename:
		tfam = open(tfam_filename).readlines()

		#Read bins
		print "Reading indexed.."
		indexes = []
		bin_sizes = [0] * len(bins_filenames)
		for bin_index, bin_filename in enumerate(bins_filenames):
			bin_file = open(bin_filename)
			bin_file_lines = bin_file.readlines()
			bin_sizes[bin_index] = len(bin_file_lines)
			for bin_file_line in bin_file_lines:
				indexes += [tfam.index(bin_file_line)]
			bin_file.close()
		total_samples = sum(bin_sizes)
		print "..done reading indexes"
	else:
		total_samples = sum(bin_sizes)
		indexes = range(total_samples)


	
		

	#Open output file
	output_file = open(output_filename, "w")
	
	#Open output info filename
	if output_info_filename:
		output_info_file = open(output_info_filename, "w")

	#Open all grobs files
	gprobs_files = [open(filename) for filename in gprobs_filenames]
	
	#Open impute2 info files
	if impute2_info_filenames:
		impute2_info_files = [open(filename) for filename in impute2_info_filenames]
	
	lines = 0
	snp_found_problems = 0
	snp_bin_problems = 0
	goto_next_line = [True] * len(gprobs_files)
	gprobs_files_lines = [None] * len(gprobs_files)
	impute2_info_files_lines = [None] * len(gprobs_files)
	while True:
		#Read lines from all files in goto_next_line
		for index in range(len(gprobs_files)):
			if goto_next_line[index]:
				gprobs_files_lines[index] = gprobs_files[index].readline()
				if impute2_info_filenames:
					impute2_info_files_lines[index] = impute2_info_files[index].readline()

		goto_next_line = [True] * len(gprobs_files)

		if all(not x for x in gprobs_files_lines):
			break

		gprobs_files_ls = [line.replace("\n", "").split() for line in gprobs_files_lines]
		if impute2_info_filenames:
			info_files_ls = [line.replace("\n", "").split() for line in impute2_info_files_lines]
		lines += 1

		if lines % 1000 == 0:
			print "Lines: ", lines, Print_now_user_Kantale()

		#Take the first 4 columns
		signatures = [ls[0:5] for ls in gprobs_files_ls]

		#All these should be the same
		if len(set([str(x) for x in signatures])) > 1:
			#If not..

			#Check panels
			sig_1 = [ls[1:5] for ls in gprobs_files_ls]
			if len(set([str(x) for x in sig_1])) == 1:
				print "Warning: A SNP is found in original study, only for part of the bins: " + str(signatures)
				snp_found_problems += 1
				continue

			#Check positions
			sig_2 = [int(x[2]) for x in signatures]
			if len(set(sig_2)) > 1:
				print "Warning: A SNP is not present in all bins: " + str(signatures)
				snp_bin_problems += 1
				#Find the lower and move to the next line
				minimum_pos = min(sig_2)
				goto_next_line = [x == minimum_pos for x in sig_2]
				continue

			#Unknown error..
			raise Exception("Unknown error in SNP signatures:", str(signatures))

		composite_line = reduce(lambda x,y: x+ y, [x[5:] for x in gprobs_files_ls])		

		to_print = signatures[0]
		for index in indexes:
			to_print += composite_line[(index*3):(index*3)+3]

		output_file.write(str.join(" ", to_print) + "\n")
		
		if output_info_filename:
			common_r2 = sum([(bin_sizes[info_index] * float(x[5])) / total_samples for info_index, x in enumerate(info_files_ls)])
			output_info_file.write(str.join(" ", info_files_ls[0][0:5] + [str(common_r2)] + info_files_ls[0][6:] ) + "\n")

	#Close all gprobs files
	[file.close() for file in gprobs_files]

	#Close output file
	output_file.close()
	
	if output_info_filename:
		output_info_file.close()

	print "SNPs found in study for part of the bins:", snp_found_problems
	print "SNPs not found in all the bins:", snp_bin_problems

[edit] Unit Tests

def uni1():
	return True

[edit] Development Code

def Assembly_impute2_gprobs_bins_user_Kantale():
	pass

[edit] Permissions

[edit] Documentation Permissions

Kantale

[edit] Code Permissions

Kantale

[edit] Unit Tests Permissions

Kantale

[edit] Permissions Permissions

Kantale

Personal tools
Namespaces

Variants
Actions
Navigation
Toolbox