由於語法渲染問題而影響閱讀體驗, 請移步博客閱讀~
本文GitPage地址
Biopython
1. Quick Start
from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate## echo a seqmy_string = "GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG"##Revers complementreverse_complement(my_string)## { 'CTAACCAGCAGCACGACCACCCTTCCAACGACCCATAACAGC' }##DNA to RNAtranscribe(my_string)##--> { 'GCUGUUAUGGGUCGUUGGAAGGGUGGUCGUGCUGCUGGUUAG' }##RNA to DNAback_transcribe(my_string) --> { 'GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG' }##DNA to protinetranslate(my_string) --> { 'AVMGRWKGGRAAG*' }
2. Sequences
2.1 reading FASTA file
from Bio import SeqIOSeq1='GSE44995_Reference_assembled_isotig_seq.fna'for seq_record in SeqIO.parse(Seq1, "fasta"):print(seq_record.id)print(repr(seq_record.seq))print(len(seq_record.seq))## print by fasta formateSeq10=''for seq_record in SeqIO.parse(Seq1, "fasta"):if len(seq_record.seq) < 100:Seq10 += ">"+str(seq_record.id)+"\n"Seq10 += str(seq_record.seq)+"\n"print(seq_record.id)print(repr(seq_record.seq))print(len(seq_record.seq))
2.2 Seq run as string
from Bio.Seq import Seqfrom Bio.Alphabet import IUPACmy_seq = Seq("GATCG", IUPAC.unambiguous_dna)for index, letter in enumerate(my_seq):print("%i %s" % (index, letter))'''print result0 G1 A2 T3 C4 G'''print(my_seq[0]) #/ print(my_seq[-1])Seq("AAAA").count("AA")len(my_seq)## convert to strmy_seq = str(my_seq)
Result
46.875
<a name="lvo99"></a>### 2.4 Slicing a sequence<br />```pythonmy_seq[4:12]#####the first, second and third codon positions of this DNA sequence:#####my_seq[0::3] # Seq('GCTGTAGTAAG', IUPACUnambiguousDNA())my_seq[1::3] # Seq('AGGCATGCATC', IUPACUnambiguousDNA())my_seq[2::3] # Seq('TAGCTAAGAC', IUPACUnambiguousDNA())
2.5 revers string
2.6 Changing Font case
3 Bio-information
3.1 Revers Complement
Seq(“ACGTCGTAGCTAC”).complement() # standard example => output: Seq(‘TGCAGCATCGATG’)
Seq(“ACGTCGTAGCTAC”).reverse_complement() # output => Seq(‘GTAGCTACGACGT’)
<a name="o4Bl4"></a>### 3.2 Translation```pythonfrom Bio.Seq import Seqfrom Bio.Alphabet import IUPACSeq("UUU", IUPAC.unambiguous_rna).translate() # for RNASeq("TTT", IUPAC.unambiguous_dna).translate() # for DNA
3.3 Translation Tables
from Bio.Data import CodonTablestandard_table = CodonTable.unambiguous_dna_by_name['Standard']mito_table = CodonTable.unambiguous_dna_by_name['Vertebrate Mitochondrial']##same as:standard_table = CodonTable.unambiguous_dna_by_id[1]mito_table = CodonTable.unambiguous_dna_by_id[2]mito_table.stop_codons #--> { ['TAA', 'TAG', 'AGA', 'AGG'] }mito_table.start_codons # --> { ['ATT', 'ATC', 'ATA', 'ATG', 'GTG'] }mito_table.forward_table["ACG"] #--> { 'T' }print(standard_table)
| T | C | A | G |--+---------+---------+---------+---------+--T | TTT F | TCT S | TAT Y | TGT C | TT | TTC F | TCC S | TAC Y | TGC C | CT | TTA L | TCA S | TAA Stop| TGA Stop| AT | TTG L(s)| TCG S | TAG Stop| TGG W | G--+---------+---------+---------+---------+--C | CTT L | CCT P | CAT H | CGT R | TC | CTC L | CCC P | CAC H | CGC R | CC | CTA L | CCA P | CAA Q | CGA R | AC | CTG L(s)| CCG P | CAG Q | CGG R | G--+---------+---------+---------+---------+--A | ATT I | ACT T | AAT N | AGT S | TA | ATC I | ACC T | AAC N | AGC S | CA | ATA I | ACA T | AAA K | AGA R | AA | ATG M(s)| ACG T | AAG K | AGG R | G--+---------+---------+---------+---------+--G | GTT V | GCT A | GAT D | GGT G | TG | GTC V | GCC A | GAC D | GGC G | CG | GTA V | GCA A | GAA E | GGA G | AG | GTG V | GCG A | GAG E | GGG G | G--+---------+---------+---------+---------+--
4. Alignment
1. Echo a test file
## run in bashecho ">TRINITY_DN106095_c2_g1_i2MSRIMKVFLFLAVMVCISEAQLHAQCLCPRVRSRISSMTDIREVQIYEATIFCDRMEIVVTNDSGLRYCLNPKLKAVQKLLTAMKPKTSTTARPTVHSSSTGSTNTARM>TRINITY_DN92154_c0_g1_i1DIHVRRRTLTRSKTLGRSTNVNKMKLCILLMLGTLLVLVYGMPPISRDYNTHCRCLQVESRIIPPNSLKSIKLVPEGPHCPDMEVIAGLSNGEKVCLNPRSSWVKKLVNFVLEKQQGGALPKNQGQ" > test.fa
2. Align
## run in pythonfrom Bio import pairwise2from Bio.Seq import Seqfrom Bio.pairwise2 import format_alignmentfrom Bio.SubsMat import MatrixInfoseq1 = Seq("MSRIMKVFLFLAVMVCISEAQLHAQCLCPRVRSRISSMTDIREVQIYEATIFCDRMEIVVTNDSGLRYCLNPKLKAVQKLLTAMKPKTSTTARPTVHSSSTGSTNTARM")seq2 = Seq("DIHVRRRTLTRSKTLGRSTNVNKMKLCILLMLGTLLVLVYGMPPISRDYNTHCRCLQVESRIIPPNSLKSIKLVPEGPHCPDMEVIAGLSNGEKVCLNPRSSWVKKLVNFVLEKQQGGALPKNQGQ")alignments = pairwise2.align.globalxx(seq1, seq2)test_alignments = pairwise2.align.localds(seq1, seq2, MatrixInfo.blosum62, -10, -1)for alignment in alignments:print(format_alignment(*alignment))for alignment in test_alignments:print(format_alignment(*alignment))
SingleLetterAlphabet() alignment with 6 rows and 65 columnsMQNTPAERLPAIIEKAKSKHDINVWLLDRQGRDLLEQRVPAKVA...EGP B7RZ31_9GAMM/59-123AKQRGIAGLEEWLHRLDHSEAIPIFLIDEAGKDLLEREVPADIT...KKP A0A0C3NPG9_9PROT/58-119ARRHGQEYFQQWLERQPKKVKEQVFAVDQFGRELLGRPLPEDMA...KKP A0A143HL37_9GAMM/57-121TRRHGPESFRFWLERQPVEARDRIYAIDRSGAEILDRPIPRGMA...NKP A0A0X3UC67_9GAMM/57-121AINRNTQQLTQDLRAMPNWSLRFVYIVDRNNQDLLKRPLPPGIM...NRK B3PFT7_CELJU/62-126AVNATEREFTERIRTLPHWARRNVFVLDSQGFEIFDRELPSPVA...NRT K4KEM7_SIMAS/61-125
PDB file
Split the chain from PDF file
Cite: David Cain
import osfrom Bio import PDBclass ChainSplitter:def __init__(self, out_dir=None):""" Create parsing and writing objects, specify output directory. """self.parser = PDB.PDBParser()self.writer = PDB.PDBIO()if out_dir is None:out_dir = os.path.join(os.getcwd(), "chain_PDBs")self.out_dir = out_dirdef make_pdb(self, pdb_path, chain_letters, overwrite=False, struct=None):""" Create a new PDB file containing only the specified chains.Returns the path to the created file.:param pdb_path: full path to the crystal structure:param chain_letters: iterable of chain characters (case insensitive):param overwrite: write over the output file if it exists"""chain_letters = [chain.upper() for chain in chain_letters]# Input/output files(pdb_dir, pdb_fn) = os.path.split(pdb_path)pdb_id = pdb_fn[3:7]out_name = "pdb%s_%s.ent" % (pdb_id, "".join(chain_letters))out_path = os.path.join(self.out_dir, out_name)print ("OUT PATH:",out_path)plural = "s" if (len(chain_letters) > 1) else "" # for printing# Skip PDB generation if the file already existsif (not overwrite) and (os.path.isfile(out_path)):print("Chain%s %s of '%s' already extracted to '%s'." %(plural, ", ".join(chain_letters), pdb_id, out_name))return out_pathprint("Extracting chain%s %s from %s..." % (plural,", ".join(chain_letters), pdb_fn))# Get structure, write new file with only given chainsif struct is None:struct = self.parser.get_structure(pdb_id, pdb_path)self.writer.set_structure(struct)self.writer.save(out_path, select=SelectChains(chain_letters))return out_pathclass SelectChains(PDB.Select):""" Only accept the specified chains when saving. """def __init__(self, chain_letters):self.chain_letters = chain_lettersdef accept_chain(self, chain):return (chain.get_id() in self.chain_letters)if __name__ == "__main__":""" Parses PDB id's desired chains, and creates new PDB structures. """import sysif not len(sys.argv) == 2:print( "Usage: $ python %s 'pdb.txt'" % __file__)sys.exit()pdb_textfn = sys.argv[1]pdbList = PDB.PDBList()splitter = ChainSplitter("") # Change me.with open(pdb_textfn) as pdb_textfile:for line in pdb_textfile:pdb_id = line[:4].lower()chain = line[4]pdb_fn = pdbList.retrieve_pdb_file(pdb_id)splitter.make_pdb(pdb_fn, chain)
Another example:
This one works fine for me
from Bio.PDB import Select, PDBIOfrom Bio.PDB.PDBParser import PDBParserimport syspdb_file = sys.argv[1]class ChainSelect(Select):def __init__(self, chain):self.chain = chaindef accept_chain(self, chain):if chain.get_id() == self.chain:return 1else:return 0chains = ['A','B','C']p = PDBParser(PERMISSIVE=1)structure = p.get_structure(pdb_file, pdb_file)for chain in chains:pdb_chain_file = 'pdb_file_chain_{}.pdb'.format(chain)io_w_no_h = PDBIO()io_w_no_h.set_structure(structure)io_w_no_h.save('{}'.format(pdb_chain_file), ChainSelect(chain))
Extract fasta from PDB file
import sysfrom Bio import SeqIOPDBFile = sys.argv[1]with open(PDBFile, 'r') as pdb_file:for record in SeqIO.parse(pdb_file, 'pdb-atom'):print('>' + record.id)print(record.seq)
Enjoy~
由於語法渲染問題而影響閱讀體驗, 請移步博客閱讀~
本文GitPage地址
GitHub: Karobben
Blog:Karobben
BiliBili:史上最不正經的生物狗
