@lakesea 2020-11-20T07:24:23.000000Z 字数 5501 阅读 482

pesudo_molecule_generate

method

python ./multiple_to_single_fasta_v2.5.py -v novel -f original_contigs.novel.fa -s fasta -n 100 -r -

这会生成一个contigs gff : original_contigs.novel_contig.gff3 和更新的fasta文件: original_contigs.novel.fasta

#### multiple_to_single_fasta_v2.5.py
#!/usr/bin/env python
# Copyright (c) 2010 ACPFG, Universty of Queensland, Brisbane, QLD 4072, Australia.
# All rights reserved.
#
# Author: Michal Lorenc m.t.lorenc@wp.pl
#Test folder
#/scratch/uqmlore1/fasta $diff BA01_v4.0_CMAP.tsv /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0_CMAP.tsv
#uqmlore1@zeus: /scratch/uqmlore1/fasta $diff BA01_v4.0_contig.gff3 /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0_contig.gff3
#uqmlore1@zeus: /scratch/uqmlore1/fasta $diff BA01_v4.0.fasta /PROJ/jarrah/brassica/Versions/Bayer_A_scaffolds/V4.0/BA01_v4.0.fasta
#TODO
#replace optparse by argparser
# create two functions for creating concatanating seq and offset
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq 
import os
from optparse import OptionParser
if __name__ == '__main__':
  parser = OptionParser("multiple_to_single_fasta.py -v Chr1 -f t.m.fasta -s fasta -r r -n 100"+'\n'+
                        "multiple_to_single_fasta.py -v 7DS_PSMOL_0.3 -f t.m.fasta -s ACPFG_pseudomolecule -r - -n 2000")
  parser.add_option("-v", "--path", type="string", dest="psMolVer",
    help="Please give PSMOL name and version eg. 7DS_PSMOL_0.3 or chromosome eg. Chr1")
  parser.add_option("-f", "--fasta", type="string", dest="mFasta",
    help="Please give a multiple fasta files eg. t.m.fasta.")
  parser.add_option("-s", "--source", type="string", dest="source",
    help="Please give the source where the data comes from eg. ACPFG_pseudomolecule or fasta.")
  parser.add_option("-r", "--reverse", type="string", dest="reverse",
    help="Please specify what character is the orientation eg. r or -.")
  parser.add_option("-n", "--spacer", type="int", dest="spacerNo",
    help="Please give how many N do you want as spacer.")
  (opts, args) = parser.parse_args()
  if opts.psMolVer is None:
    print "Please give PSMOL name and version eg. 7DS_PSMOL_0.3 or chromosome eg. Chr1"
    parser.print_help()  
  elif opts.mFasta is None:
    print "Please give a multiple fasta files eg. t.m.fasta."
    parser.print_help()
  elif opts.source is None:
    print "Please give the source where the data comes from eg. ACPFG_pseudomolecule or fatsta."
    parser.print_help()        
  elif opts.reverse is None:
    print "Please specify what character is the orientation eg. r or -. If you do not use " +\
        "orientation use then ?"
    parser.print_help()
  elif opts.spacerNo is None:
    print "Please give how many N do you want as spacer."
    parser.print_help()        
  else:
    psmol_version = opts.psMolVer
    if opts.reverse == "-":
      basename = os.path.splitext(opts.mFasta)[0][0:-2]
    if opts.reverse == "?":
      basename = os.path.splitext(opts.mFasta)[0] + '_pseudo'
    else:
      basename = os.path.splitext(opts.mFasta)[0].replace("m", "", 1)
    output_single_fasta = os.path.join(os.path.dirname(basename), basename + ".fasta")
    output_gff = os.path.join(os.path.dirname(basename), basename + "_contig.gff3")
    output_tsv = os.path.join(os.path.dirname(basename), basename + "_CMAP.tsv")
    print "Input fasta file is ["+opts.mFasta+"]"
    print "Output fasta file is ["+output_single_fasta+"]"
  # Rip multiple fasta into Seq object list
    fa_parser = SeqIO.parse(open(opts.mFasta, "rU"), "fasta")
    psmol_sequence = ('N' * opts.spacerNo).join(str(rec.seq) for rec in fa_parser)
    seqRecord = (SeqRecord(Seq(psmol_sequence),id = opts.psMolVer,description = ""))
    outFastaFH = open(output_single_fasta,'w')
    SeqIO.write(seqRecord,outFastaFH,"fasta")
    outFastaFH.close()
  # Generate GFF and CMAP outputs
    print "Output GFF file is ["+output_gff+"]"
    print "Output CMAP file is ["+output_tsv+"]"
    gff_fh = open(output_gff,'w')
    cmap_fh = open(output_tsv,'w')
    gff_fh.write("##gff-version\t3\n##sequence-region\t"+opts.psMolVer+"\t1\t"+str(len(psmol_sequence))+"\n")
    cmap_fh.write("map_name\tmap_acc\tmap_display_order\tmap_start\tmap_stop\tfeature_acc\tfeature_name\tfeature_alt_name\tfeature_aliases\tfeature_start\tfeature_stop\tfeature_direction\tfeature_type_acc\tfeature_dbxref_name\tfeature_dbxref_url\tfeature_attributes\tis_landmark\n")
    Seq_start = 1
    Seq_end = 1
    for i, record in enumerate(SeqIO.parse(open(opts.mFasta), "fasta")):
      if i==0:
        Seq_end = len(record.seq)
      else:
        Seq_start = Seq_end+int(opts.spacerNo)+1
        Seq_end = Seq_start+len(record)-1
      if opts.reverse == "-":
        direction = record.id.split("|")[-1].rstrip()
        if direction == "1":
          strand = "+"
        elif direction == "-1":
          strand = "-"
      if opts.reverse == "?":
        strand = "."
        direction =  "."
      else:
        direction = record.id.split("_")[1][-1]
        if direction == "r":
          strand = "-"
        else:
          strand = "+"
      gff_fh.write(opts.psMolVer+"\t"+opts.source+"\tcontig\t"+str(Seq_start)+"\t"+str(Seq_end)+"\t.\t"+strand+"\t.\tID="+record.id+";Name="+record.id+"\n")
      cmap_fh.write(opts.psMolVer+"\t"+opts.psMolVer+"\t\t1\t"+str(len(psmol_sequence))+"\t"+record.id+"\t"+record.id+"\t\t\t"+str(Seq_start)+"\t"+str(Seq_end)+"\t"+direction+"\tsyntenic_block\t\t\t\t0\n")
    gff_fh.close()
    cmap_fh.close()

python makeSingleGff.py original_contigs.novel.fa > original_contigs.novel.gff3

### makeSingleGff.py
import sys
from Bio import SeqIO
for s in SeqIO.parse(sys.argv[1], 'fasta'):
    thisid = s.id
    source = '.'
    thistype ='contig'
    start = 1
    end = len(s)+1
    score = '.'
    strand = '+'
    phase = '.'
    attributes = 'ID=%s'%s.id
    print('\t'.join(map(str, [thisid, source,thistype, start, end, score, strand, phase ,attributes])))

### original_contigs.novel.gff3
scf7180002493335        .       contig  1       505     .       +       .       ID=scf7180002493335
scf7180002495135        .       contig  1       1293    .       +       .       ID=scf7180002495135
scf7180002499243        .       contig  1       723     .       +       .       ID=scf7180002499243
scf7180002499304        .       contig  1       541     .       +       .       ID=scf7180002499304
scf7180002502282        .       contig  1       706     .       +       .       ID=scf7180002502282
perl /group/pawsey0149/groupEnv/ivec/app/bioscripts/gbrowse/offset_feature_coordinate_old2new_ref.pl -oldref original_contigs.novel.gff3 -newref original_contigs.novel_contig.gff3 original_novel.sort.final.gff
###
###
###this will generate a original_novel.sort.final_newRef.gff file

pesudo_molecule_generate

这会生成一个contigs gff : original_contigs.novel_contig.gff3 和 更新的fasta文件: original_contigs.novel.fasta

内容目录

这会生成一个contigs gff : original_contigs.novel_contig.gff3 和更新的fasta文件: original_contigs.novel.fasta