Convert from VCF to mach user Kantale

From PyPedia
Jump to: navigation, search

Contents

[edit] Documentation

Convert phased data from VCF to Mach format. Attention: The VCF file should contain data from only one chromosome. To extract a chromosome from a VCF file use: VCFTools


[edit] Parameters

Input VCF filename:
Output hap filename:
Output map filename:
The chromosome that this VCF file contains:

<inputs>
<param name="input_VCF_filename" type="data" value="" label="Input VCF filename: "/>
<param name="output_mach_hap_filename" type="data" value="" label="Output hap filename: "/>
<param name="output_mach_map_filename" type="data" value="" label="Output map filename: "/>
<param name="chromosome" type="data" value="" label="The chromosome that this VCF file contains: "/>
</inputs>


[edit] Return

[edit] See also

[edit] Code

def Convert_from_VCF_to_mach_user_Kantale(
	input_VCF_filename = None,
	output_mach_hap_filename = None,
	output_mach_map_filename = None,
	chromosome = None,
):

	input_VCF_file = open(input_VCF_filename)

	if not chromosome:
		raise Exception("Undefined parameter: chromosome")

	print "Reading input files..", Print_now_user_Kantale()
	output_mach_map_file = open(output_mach_map_filename, "w")

	lineno = 0
	invalid_columns = 0
	no_GT = 0
	while True:
		line = input_VCF_file.readline()
		if not line:
			break

		lineno += 1

		if lineno % 10000 == 0:
			print "Line:", lineno, Print_now_user_Kantale()

		line_s = line.replace("\n", "").split()
		
		if line_s[0] == "#CHROM":
			samples = line_s[9:]
			haplotypes = {}


		if line_s[0][0] != "#":

			reference   = line_s[3]
			alternative = line_s[4]
			allele_table = list(reference + alternative)

			rs_code = line_s[2]
			position = line_s[1]
			if rs_code == ".":
				rs_code = str(chromosome) + "-" + position

			map_line = [str(chromosome), rs_code, position]

	
			format = line_s[8]
			format_s = format.split(":")
			format_index = -1
			for index, field in enumerate(format_s):
				if field == "GT":
					format_index = index
					break
			if format_index < 0:
				print "Warning: No GT field found on format. Line: ", lineno
				no_GT += 1
				continue

			current_samples = line_s[9:]
			if len(current_samples) != len(samples):
				print "Warning: Ignoring line", lineno, ". It has ", len(current_samples), " samples whereas header has ", len(samples)
				invalid_columns += 1
				continue

			#Saving the map line
			output_mach_map_file.write(str.join("\t", map_line) + "\n")

			for sample_index, sample_field in enumerate(current_samples):
				sample_field_s = sample_field.split(":")
				genotype = sample_field_s[format_index]
				genotype_s = genotype.split("|")
				if len(genotype_s) == 1:
					#This is not a phased genotype. Ignore it
					pass
				else:
					#print samples
					if not haplotypes.has_key(samples[sample_index]):
						haplotypes[samples[sample_index]] = [[], []]
					for gen_index, gen_value in enumerate(genotype_s):
						if gen_value == ".":
							raise Exception("'.' detected. Cannot handle unknown values in MACH format. Line:" + str(lineno))
						else:
							this_allele = allele_table[int(gen_value)]
							haplotypes[samples[sample_index]][gen_index] += [this_allele]

	print "..Done", Print_now_user_Kantale()

	input_VCF_file.close()
	output_mach_map_file.close()

	print "Save hap file.."
	output_mach_hap_file = open(output_mach_hap_filename, "w")
	for sample_index, sample in enumerate(haplotypes):
		print "Saving Sample:", sample, (sample_index+1), '/', len(haplotypes), Print_now_user_Kantale()
		for hap_index, hap in enumerate(["HAPL01", "HAPL02"]):
			output_mach_hap_file.write(str.join("\t", [sample, hap, str.join("", haplotypes[sample][hap_index])]) + "\n")
	print "..Done"
	output_mach_hap_file.close()

	if invalid_columns: print "Lines with invalid sample columns:", invalid_columns
	if no_GT: print "Lines with no GT field:", no_GT

[edit] Unit Tests

def uni1():
	return True

[edit] Development Code

def Convert_from_VCF_to_mach_user_Kantale():
	pass

[edit] Permissions

[edit] Documentation Permissions

Kantale

[edit] Code Permissions

Kantale

[edit] Unit Tests Permissions

Kantale

[edit] Permissions Permissions

Kantale

Personal tools
Namespaces

Variants
Actions
Navigation
Toolbox