@lakesea
2020-11-20T07:24:23.000000Z
字数 5501
阅读 482
method
python ./multiple_to_single_fasta_v2.5.py -v novel -f original_contigs.novel.fa -s fasta -n 100 -r -
#### multiple_to_single_fasta_v2.5.py#!/usr/bin/env python# Copyright (c) 2010 ACPFG, Universty of Queensland, Brisbane, QLD 4072, Australia.# All rights reserved.## Author: Michal Lorenc m.t.lorenc@wp.pl#Test folder#/scratch/uqmlore1/fasta $diff BA01_v4.0_CMAP.tsv /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0_CMAP.tsv#uqmlore1@zeus: /scratch/uqmlore1/fasta $diff BA01_v4.0_contig.gff3 /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0_contig.gff3#uqmlore1@zeus: /scratch/uqmlore1/fasta $diff BA01_v4.0.fasta /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0.fasta#TODO#replace optparse by argparser# create two functions for creating concatanating seq and offsetfrom Bio import SeqIOfrom Bio.SeqRecord import SeqRecordfrom Bio.Seq import Seqimport osfrom optparse import OptionParserif __name__ == '__main__':parser = OptionParser("multiple_to_single_fasta.py -v Chr1 -f t.m.fasta -s fasta -r r -n 100"+'\n'+"multiple_to_single_fasta.py -v 7DS_PSMOL_0.3 -f t.m.fasta -s ACPFG_pseudomolecule -r - -n 2000")parser.add_option("-v", "--path", type="string", dest="psMolVer",help="Please give PSMOL name and version eg. 7DS_PSMOL_0.3 or chromosome eg. Chr1")parser.add_option("-f", "--fasta", type="string", dest="mFasta",help="Please give a multiple fasta files eg. t.m.fasta.")parser.add_option("-s", "--source", type="string", dest="source",help="Please give the source where the data comes from eg. ACPFG_pseudomolecule or fasta.")parser.add_option("-r", "--reverse", type="string", dest="reverse",help="Please specify what character is the orientation eg. r or -.")parser.add_option("-n", "--spacer", type="int", dest="spacerNo",help="Please give how many N do you want as spacer.")(opts, args) = parser.parse_args()if opts.psMolVer is None:print "Please give PSMOL name and version eg. 7DS_PSMOL_0.3 or chromosome eg. Chr1"parser.print_help()elif opts.mFasta is None:print "Please give a multiple fasta files eg. t.m.fasta."parser.print_help()elif opts.source is None:print "Please give the source where the data comes from eg. ACPFG_pseudomolecule or fatsta."parser.print_help()elif opts.reverse is None:print "Please specify what character is the orientation eg. r or -. If you do not use " +\"orientation use then ?"parser.print_help()elif opts.spacerNo is None:print "Please give how many N do you want as spacer."parser.print_help()else:psmol_version = opts.psMolVerif opts.reverse == "-":basename = os.path.splitext(opts.mFasta)[0][0:-2]if opts.reverse == "?":basename = os.path.splitext(opts.mFasta)[0] + '_pseudo'else:basename = os.path.splitext(opts.mFasta)[0].replace("m", "", 1)output_single_fasta = os.path.join(os.path.dirname(basename), basename + ".fasta")output_gff = os.path.join(os.path.dirname(basename), basename + "_contig.gff3")output_tsv = os.path.join(os.path.dirname(basename), basename + "_CMAP.tsv")print "Input fasta file is ["+opts.mFasta+"]"print "Output fasta file is ["+output_single_fasta+"]"# Rip multiple fasta into Seq object listfa_parser = SeqIO.parse(open(opts.mFasta, "rU"), "fasta")psmol_sequence = ('N' * opts.spacerNo).join(str(rec.seq) for rec in fa_parser)seqRecord = (SeqRecord(Seq(psmol_sequence),id = opts.psMolVer,description = ""))outFastaFH = open(output_single_fasta,'w')SeqIO.write(seqRecord,outFastaFH,"fasta")outFastaFH.close()# Generate GFF and CMAP outputsprint "Output GFF file is ["+output_gff+"]"print "Output CMAP file is ["+output_tsv+"]"gff_fh = open(output_gff,'w')cmap_fh = open(output_tsv,'w')gff_fh.write("##gff-version\t3\n##sequence-region\t"+opts.psMolVer+"\t1\t"+str(len(psmol_sequence))+"\n")cmap_fh.write("map_name\tmap_acc\tmap_display_order\tmap_start\tmap_stop\tfeature_acc\tfeature_name\tfeature_alt_name\tfeature_aliases\tfeature_start\tfeature_stop\tfeature_direction\tfeature_type_acc\tfeature_dbxref_name\tfeature_dbxref_url\tfeature_attributes\tis_landmark\n")Seq_start = 1Seq_end = 1for i, record in enumerate(SeqIO.parse(open(opts.mFasta), "fasta")):if i==0:Seq_end = len(record.seq)else:Seq_start = Seq_end+int(opts.spacerNo)+1Seq_end = Seq_start+len(record)-1if opts.reverse == "-":direction = record.id.split("|")[-1].rstrip()if direction == "1":strand = "+"elif direction == "-1":strand = "-"if opts.reverse == "?":strand = "."direction = "."else:direction = record.id.split("_")[1][-1]if direction == "r":strand = "-"else:strand = "+"gff_fh.write(opts.psMolVer+"\t"+opts.source+"\tcontig\t"+str(Seq_start)+"\t"+str(Seq_end)+"\t.\t"+strand+"\t.\tID="+record.id+";Name="+record.id+"\n")cmap_fh.write(opts.psMolVer+"\t"+opts.psMolVer+"\t\t1\t"+str(len(psmol_sequence))+"\t"+record.id+"\t"+record.id+"\t\t\t"+str(Seq_start)+"\t"+str(Seq_end)+"\t"+direction+"\tsyntenic_block\t\t\t\t0\n")gff_fh.close()cmap_fh.close()
python makeSingleGff.py original_contigs.novel.fa > original_contigs.novel.gff3
### makeSingleGff.pyimport sysfrom Bio import SeqIOfor s in SeqIO.parse(sys.argv[1], 'fasta'):thisid = s.idsource = '.'thistype ='contig'start = 1end = len(s)+1score = '.'strand = '+'phase = '.'attributes = 'ID=%s'%s.idprint('\t'.join(map(str, [thisid, source,thistype, start, end, score, strand, phase ,attributes])))
### original_contigs.novel.gff3scf7180002493335 . contig 1 505 . + . ID=scf7180002493335scf7180002495135 . contig 1 1293 . + . ID=scf7180002495135scf7180002499243 . contig 1 723 . + . ID=scf7180002499243scf7180002499304 . contig 1 541 . + . ID=scf7180002499304scf7180002502282 . contig 1 706 . + . ID=scf7180002502282perl /group/pawsey0149/groupEnv/ivec/app/bioscripts/gbrowse/offset_feature_coordinate_old2new_ref.pl -oldref original_contigs.novel.gff3 -newref original_contigs.novel_contig.gff3 original_novel.sort.final.gff#########this will generate a original_novel.sort.final_newRef.gff file