Convert tab file to VCF user Kantale

From PyPedia
Jump to: navigation, search

Contents

[edit] Documentation

Converts a tabular file into VCF format (http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40). The dictionary tab_format contains the column indexes and potential lambda function for column processing.

Example:

For the file:

chr10	83945	G	R	99	G	35	50	73	A	32	28	36	109	0.0418952	1.55963	1	73
chr10	84018	T	Y	47	T	35	33	36	C	36	5	5	41	0.725034	1.12195	1	73
chr10	84545	C	Y	99	C	35	16	16	T	33	15	15	31	0.730608	1.00000	1	527
chr10	272897	C	T	85	T	35	22	22	C	0	0	0	22	1.00000	1.00000	1	2303
chr10	282927	C	A	83	A	36	23	23	C	0	0	0	23	1.00000	1.00000	1	1218

Use the following code

import pypedia
from pypedia import Convert_tab_file_to_VCF

tab_format = {"chromosome" : 0, "position" :1, "rs":None, "reference" : 2, "chr1": 5, "chr2" : 9}

return Convert_tab_file_to_VCF_user_Kantale("INPUT_TAB_FILE", tab_format)

if the tab_format is omitted then the following format is assumed:

Chromosome   position   rs_code   reference   chr1   chr2

In that case, an example input could be:

10	83945	rs1	G	A	A
10	83361	rs2	G	G	A
10	89253	rs3	G	G	G

This method generates a vcf file withe a single sample.



[edit] Parameters

Input tab separated file
Output VCF filename

<inputs>
<param name="input_filename" value="" type="data" label="Input tab separated file "/>
<param name="output_filename" value="" type="data" label="Output VCF filename "/>
</inputs>


[edit] Return

The output filename

[edit] See also

[edit] Code

import os

def Convert_tab_file_to_VCF_user_Kantale(
	input_filename=None, 
	tab_format=None,
	output_filename=None
	):

	if not tab_format:
		tab_format = {"chromosome" : 0, "position" :1, "rs": 2, "reference" : 3, "chr1": 4, "chr2" : 5}

	header, header_str = Default_VCF_header_user_Kantale(None, [input_filename])
	
	file = open(input_filename)
	output = open(output_filename, "w")

	output.write(header_str)

	currentLine = 0
	while True:
		line = file.readline()
		if not line: 
			break
	
		currentLine += 1
		if currentLine % 10000 == 0:
			print "Lines:", currentLine, Print_now_user_Kantale()
		toPrint = []

		if line.strip() == "":
			continue

		lineSplitted = line.replace("\n", "").split()
		
		toPrint += [lineSplitted[tab_format["chromosome"]].replace("chr", "")] #chromosome
		toPrint += [lineSplitted[tab_format["position"]]] #position

		if not tab_format.has_key("rs"):
			toPrint += ["."]
		else:
			toPrint += [lineSplitted[tab_format["rs"]]] #ID 

		if tab_format.has_key("reference"):
			ref = lineSplitted[tab_format["reference"]]
		else:
			ref = "A"
		
		toPrint += [ref] # reference

		if tab_format.has_key("chr1"):
			alt1 = lineSplitted[tab_format["chr1"]]
		else:
			alt1 = "C"
		if tab_format.has_key("chr2"):
			alt2 = lineSplitted[tab_format["chr2"]]
		else:
			alt2 = "C"
		
		alternativeToPrint = []
		if alt1 != ref: alternativeToPrint += [alt1] #Alternative
		if alt2 not in [alt1, ref]: alternativeToPrint += [alt2] #Alternative
	
		if len(alternativeToPrint) == 0: 
			toPrint += ['.']
		else:
			toPrint += [str.join(",", alternativeToPrint)] #Alternative

		toPrint += ["100", "Q20", "NS=1", "GT"]
	
		genotype = ("0" if alt1 == ref else "1") + "/" + ("0" if alt2 == ref else "1")
		toPrint += [genotype]	#
	
		output.write( str.join("\t", toPrint) + "\n")

	file.close()
	output.close()
	
	return output_filename

[edit] Unit Tests

def uni1():
	return True

[edit] Development Code

def Convert_tab_file_to_VCF_user_Kantale():
	pass

[edit] Permissions

[edit] Documentation Permissions

Kantale

[edit] Code Permissions

Kantale

[edit] Unit Tests Permissions

Kantale

[edit] Permissions Permissions

Kantale

Personal tools
Namespaces

Variants
Actions
Navigation
Toolbox