plasmid_seq system:sage {{{id=37| import csv /// }}} {{{id=55| import scipy.stats /// }}} {{{id=15| def count_kmers(read, k): counts = {} num_kmers = len(read) - k + 1 for i in range(num_kmers): kmer = read[i:i+k] if kmer not in counts: counts[kmer] = 0 counts[kmer] +=1 return counts /// }}} {{{id=16| def fre_kmers(read,k): counts=count_kmers(read, k) temp=0 for key in counts: temp+=counts[key] for key in counts: counts[key]=counts[key]/temp return counts /// }}} {{{id=17| def kmer_dis(read1,read2,k): fre1=fre_kmers(read1,k) fre2=fre_kmers(read2,k) temp=0 for key in fre1: if key in fre2: temp+=abs(fre1[key]-fre2[key]) else: temp+=fre1[key] for key in fre2: if key not in fre1: temp+=fre2[key] return temp /// }}} {{{id=48| def reverse_key(kmer_dic): new_dic={} for key in kmer_dic: new_key=key[::-1] new_dic[new_key]=kmer_dic[key] return new_dic /// }}} {{{id=49| def complement_key(kmer_dic): new_dic={} swap_dic={'A':'T','T':'A','G':'C','C':'G'} for key in kmer_dic: new_key='' for i in range(len(key)): new_key+=swap_dic[key[i]] new_dic[new_key]=kmer_dic[key] return new_dic /// }}} {{{id=25| def dic_L1(dic1,dic2): temp=0 for key in dic1: if key in dic2: temp+=abs(dic1[key]-dic2[key]) else: temp+=dic1[key] for key in dic2: if key not in dic1: temp+=dic2[key] return temp /// }}} {{{id=66| def dic_jacc(dic1,dic2): cap=0 cup=len(dic1) for key in dic2: if key in dic1: cap+=1 else: cup+=1 return 1-cap/cup /// }}} {{{id=50| def dic_L1_eq(dic1,dic2): d1=dic_L1(dic1,dic2) d2=dic_L1(dic1,complement_key(dic2)) d3=dic_L1(dic1,reverse_key(dic2)) d4=dic_L1(dic1,reverse_key(complement_key(dic2))) return min([d1,d2,d3,d4]) /// }}} {{{id=68| def dic_jacc_eq(dic1,dic2): d1=dic_jacc(dic1,dic2) d2=dic_jacc(dic1,complement_key(dic2)) d3=dic_jacc(dic1,reverse_key(dic2)) d4=dic_jacc(dic1,reverse_key(complement_key(dic2))) return min([d1,d2,d3,d4]) /// }}} {{{id=67| dic1={'ATG': 1/3, 'GAT': 1/3, 'TGA': 1/3} dic2={'ATG': 1/4, 'GAT': 1/2, 'TGA': 1/4} dic3={'AGT': 1/4, 'GTA': 1/4, 'TAG': 1/2} print dic_jacc_eq(dic1,dic3) /// 0 }}} {{{id=18| def pairwise_kmer_dis(read_list,k): dis_mat=[] fre_list=[] l=len(read_list) for i in range(l): fre_list.append(fre_kmers(read_list[i],k)) for i in range(l): temp=[] for j in range(l): temp.append(dic_L1_eq(fre_list[i],fre_list[j])) dis_mat.append(temp) matrix_plot(matrix(dis_mat), colorbar=True).show() return dis_mat /// }}} {{{id=69| def pairwise_jacc_dis(read_list,k): dis_mat=[] fre_list=[] l=len(read_list) for i in range(l): fre_list.append(fre_kmers(read_list[i],k)) for i in range(l): temp=[] for j in range(l): temp.append(dic_jacc_eq(fre_list[i],fre_list[j])) dis_mat.append(temp) matrix_plot(matrix(dis_mat), colorbar=True).show() return dis_mat /// }}} {{{id=1| def parse_fasta(fname): file=open('/Users/biocomplexity/Documents/plasmid/ST307/assemblies/'+fname+'.assembly_relabel.fasta','r') lines=file.readlines() out=[] identifier = 0 sequence = '' for line in lines: line = line.strip() if line.startswith(">"): if identifier==0: identifier = line else: out.append([identifier,sequence]) identifier = line sequence = '' else: sequence+=line out.append([identifier,sequence]) file.close() return out /// }}} {{{id=33| def sample_summary(fname,k): temp=parse_fasta(fname) readlist=[] for obj in temp: readlist.append(obj[1]) pairwise_kmer_dis(readlist,k) /// }}} {{{id=41| def get_contig(inc_type): output=[] pos=inc_type_dic[inc_type] for i in range(len(indicator)): if indicator[i][pos]==1: output.append(contig_list[i]) return output /// }}} {{{id=42| def get_read(contig): fname=contig[:5]+contig[8:12] read_list=parse_fasta(fname) temp=0 for i in range(len(read_list)): if contig==read_list[i][0][1:]: temp=read_list[i][1] return temp /// }}} {{{id=7| name_list=['49315__BK','49320__AM','49320__JP','49321__JP','50793__BK','50847__AM','50870__AM','50870__BK','50870__JP','50874__BK','50874__JP','50876__JP','50877__AM','50877__BK','50878__AM','50879__AM','50879__BK','50879__JP','50887__BK','50889__AM','50899__JP','50900__BK','50912__AM','50912__JP','50917__JP','50925__BK','50927__AM','50929__JP','50947__BK','50949__AM','50960__JP','50961__BK','50967__AM','50978__BK','50982__JP','51899__BK','51902__JP','51906__AM','51915__JP','51930__JP','56871__JP'] /// }}} {{{id=31| print sample_list /// ['49315', '49320', '49321', '50793', '50847', '50870', '50874', '50876', '50877', '50878', '50879', '50887', '50889', '50899', '50900', '50912', '50917', '50925', '50927', '50929', '50947', '50949', '50960', '50961', '50967', '50978', '50982', '51899', '51902', '51906', '51915', '51930', '56871'] }}} {{{id=20| sample_list=[] for name in name_list: if name[:5] not in sample_list: sample_list.append(name[:5]) /// }}} {{{id=9| sample_nd_dic={} for name in name_list: if name[:5] not in sample_nd_dic: sample_nd_dic[name[:5]]=0 sample_nd_dic[name[:5]]+=1 /// }}} {{{id=11| sample_pos_dic={} l=len(name_list) temp='xxxxxx' for i in range(l): if name_list[i][:5]!=temp: sample_pos_dic[name_list[i][:5]]=i temp=name_list[i][:5] /// }}} {{{id=34| name_nd_list=[] for key in sample_pos_dic: name_nd_list.append(name_list[sample_pos_dic[key]]) /// }}} {{{id=36| file=open('/Users/biocomplexity/Documents/plasmid/ST307/ST307.plasmidfinder_results.tsv','r') reader=csv.reader(file, delimiter='\t') matr=[] for row in reader: temp=[] for i in range(len(row)): temp.append(row[i]) matr.append(temp) file.close() /// }}} {{{id=38| inc_type_list=[] for i in range(184): if matr[i+1][2] not in inc_type_list: inc_type_list.append(matr[i+1][2]) inc_type_dic={} for i in range(18): inc_type_dic[inc_type_list[i]]=i /// }}} {{{id=39| contig_list=[] for i in range(184): if matr[i+1][5] not in contig_list: contig_list.append(matr[i+1][5]) contig_dic={} for i in range(110): contig_dic[contig_list[i]]=i /// }}} {{{id=40| indicator=[] for i in range(110): temp=[] for j in range(18): temp.append(0) indicator.append(temp) for row in matr[1:]: indicator[contig_dic[row[5]]][inc_type_dic[row[2]]]=1 /// }}} {{{id=57| sum([1,2,3]) /// 6 }}} {{{id=56| plasmid_list=[] for i in range(110): if sum(indicator[i])!=0: plasmid_list.append(contig_list[i]) /// }}} {{{id=58| len(plasmid_list) /// 110 }}} {{{id=52| len(incX3[0][1:]) /// 25 }}} {{{id=53| len(fibkX) /// 38 }}} {{{id=63| rank_X3 = 0 for obj in plasmid_total[0][1:]: if obj>0.46739: rank_X3+=1 /// }}} {{{id=64| rank_X3 /// 27 }}} {{{id=62| print float(mean(fibkX[0][1:])),float(std(fibkX[0][1:])) /// 0.270300131396 0.0186597741643 }}} {{{id=61| print float(median(incX3[0][1:])),float(std(incX3[0][1:])) /// 0.467382436024 0.0442171451418 }}} {{{id=60| print float(median(plasmid_total[0][1:])),float(std(plasmid_total[0][1:])) /// 0.41113922943 inf }}} {{{id=54| scipy.stats.mannwhitneyu(incX3[0][1:], plasmid_total[0][1:],alternative='two-sided') /// MannwhitneyuResult(statistic=1823.5, pvalue=0.011038213170522658) }}} {{{id=59| reads=[] reads.append(chrom_read_list[0]) for contig in plasmid_list: print contig reads.append(get_read(contig)) for read in reads: print len(read) plasmid_total=pairwise_kmer_dis(reads,5) /// WARNING: Output truncated! full_output.txt 49315__5__BK|78|circular 49315__4__BK|78|circular 49315__2__BK|78|circular 49315__3__BK|78|circular 49320__3__AM|I8|circular 49320__2__AM|I8|circular 49320__3__JP|I8|circular 49320__2__JP|I8|circular 49321__3__JP|I13|circular 49321__2__JP|I13|circular 50793__4__BK|KO1|circular 50793__3__BK|KO1|circular 50793__2__BK|KO1|circular 50847__3__AM|KO90|circular 50847__2__AM|KO90|circular 50870__5__AM|107_ESBL|circular 50870__4__AM|107_ESBL|circular 50870__2__AM|107_ESBL|circular 50870__3__AM|107_ESBL|circular 50870__6__BK|107_ESBL|circular 50870__4__BK|107_ESBL|circular 50870__2__BK|107_ESBL|circular 50870__3__BK|107_ESBL|circular 50870__8__JP|107_ESBL|circular 50870__4__JP|107_ESBL|circular 50870__2__JP|107_ESBL|circular 50870__3__JP|107_ESBL|linear 50874__4__BK|38_ESBL|circular 50874__3__BK|38_ESBL|circular 50874__2__BK|38_ESBL|circular 50874__4__JP|38_ESBL|circular 50874__3__JP|38_ESBL|circular 50874__2__JP|38_ESBL|circular 50876__5__JP||circular 50876__3__JP||circular 50876__4__JP||circular 50876__2__JP||linear 50877__4__AM|A05|circular 50877__2__AM|A05|circular 50877__5__AM|A05|circular 50877__4__BK|A05|circular 50877__2__BK|A05|circular 50877__5__BK|A05|circular 50878__6__AM|A21|circular 50878__4__AM|A21|circular 50878__2__AM|A21|linear 50879__3__AM|C6|circular 50879__2__AM|C6|linear 50879__3__BK|C6|circular 50879__2__BK|C6|linear 50879__2__JP|C6|linear 50887__6__BK|E23|circular 50887__3__BK|E23|circular 50887__2__BK|E23|circular 50889__3__AM|E30|circular 50889__2__AM|E30|circular 50899__3__JP|F20|circular 50899__2__JP|F20|circular 50900__3__BK|F25|linear ... 337117 2859 51479 193137 51479 192922 51479 192922 207780 51479 51479 185398 51479 185656 51479 193137 51479 192922 150996 51479 194831 51479 195566 150995 3549 51479 193007 51479 188774 3549 51479 192792 3549 51479 265406 3549 51479 192792 1551 9293 89609 191494 1551 9293 89609 191470 1551 9293 243050 1551 9293 248812 96097 231486 51484 195438 51479 191087 170141 435025 }}} {{{id=51| reads=[] reads.append(chrom_read_list[0]) for contig in get_contig('IncX3'): print contig reads.append(get_read(contig)) for read in reads: print len(read) incX3=pairwise_kmer_dis(reads,5) /// 49320__3__AM|I8|circular 49320__3__JP|I8|circular 49321__3__JP|I13|circular 50793__3__BK|KO1|circular 50847__3__AM|KO90|circular 50879__3__AM|C6|circular 50879__3__BK|C6|circular 50879__2__JP|C6|linear 50887__3__BK|E23|circular 50889__3__AM|E30|circular 50899__3__JP|F20|circular 50900__4__BK|F25|circular 50912__3__AM|G27|circular 50912__3__JP|G27|circular 50917__3__JP|G58|circular 50925__5__BK|H77|circular 50927__5__AM|L13|circular 50929__5__JP|L28|circular 50947__3__BK|I2|circular 50949__3__AM|I15|circular 50960__3__JP|K22|circular 50961__3__BK|K29|circular 50967__4__AM|K61|circular 51915__2__JP|NH25|linear 51930__5__JP|G13|circular 5309360 51307 51307 51477 51479 51479 95747 95747 337117 51479 51479 51479 51479 51479 51479 51479 51479 51479 51479 51479 51479 51479 51479 51479 51484 51479 }}} {{{id=43| reads=[] reads.append(chrom_read_list[0]) for contig in get_contig('IncFIB(K)'): print contig reads.append(get_read(contig)) for read in reads: print len(read) fibkX=pairwise_kmer_dis(reads,5) /// 49315__2__BK|78|circular 49320__2__AM|I8|circular 49320__2__JP|I8|circular 49321__2__JP|I13|circular 50793__2__BK|KO1|circular 50847__2__AM|KO90|circular 50870__2__AM|107_ESBL|circular 50870__2__BK|107_ESBL|circular 50870__2__JP|107_ESBL|circular 50874__2__BK|38_ESBL|circular 50874__2__JP|38_ESBL|circular 50876__2__JP||linear 50877__2__AM|A05|circular 50877__2__BK|A05|circular 50878__2__AM|A21|linear 50887__2__BK|E23|circular 50889__2__AM|E30|circular 50899__2__JP|F20|circular 50900__3__BK|F25|linear 50912__2__AM|G27|circular 50912__2__JP|G27|circular 50917__2__JP|G58|circular 50925__2__BK|H77|circular 50927__3__AM|L13|circular 50929__2__JP|L28|circular 50947__2__BK|I2|circular 50949__2__AM|I15|circular 50960__2__JP|K22|circular 50967__2__AM|K61|circular 50978__2__BK||circular 50982__2__JP|KV5|circular 51899__2__BK|IM18|circular 51902__2__JP|IM43|linear 51906__3__AM|IM68|circular 51915__3__JP|NH25|linear 51930__2__JP|G13|circular 56871__2__JP||circular 5309360 242625 206072 193137 184067 192792 192922 171075 171333 170776 243741 243692 279752 234446 221646 176630 193137 192922 192922 207780 185398 185656 193137 192922 194831 195566 193007 188774 192792 192792 191494 191470 243050 248812 231486 195438 191087 435025 }}} {{{id=65| for inc_name in inc_type_list: print inc_name reads=[] reads.append(chrom_read_list[0]) for contig in get_contig(inc_name): reads.append(get_read(contig)) print len(reads)-1 testabc=pairwise_kmer_dis(reads,5) print float(median(testabc[0][1:])) /// Col440I 7 0.612474055765 IncFIA(HI1) 5 0.317374731223 IncFIB(K) 37 0.266010412234 IncFIB(Mar) 8 0.495657507327 IncFII(K) 36 0.266002975077 IncR 5 0.317374731223 ColKP3 25 0.467304728399 IncX3 25 0.467382436024 Col(MG828) 10 0.776165880531 ColRNAI 11 0.41135453786 IncA/C2 1 0.397385179161 IncFII 4 0.267132662441 IncL/M(pOXA-48) 2 0.320168652211 IncFIB(pQil) 3 0.267090403177 IncHI1B 1 0.454951245412 FII(pBK30683) 2 0.239491605923 IncHI2 1 0.365804254081 IncHI2A 1 0.365804254081 }}} {{{id=47| reads=[] reads.append(chrom_read_list[0]) for contig in get_contig('IncFII'): print contig reads.append(get_read(contig)) for read in reads: print len(read) FII=pairwise_kmer_dis(reads,5) /// 50876__2__JP||linear 50925__3__BK|H77|circular 50929__3__JP|L28|circular 51930__3__JP|G13|circular 5309360 279752 150996 150995 170141 }}} {{{id=70| pairwise_jacc_dis(chrom_read_list,7) /// WARNING: Output truncated! full_output.txt [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1/4096, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1/8192, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1/4096, 0, 0, 0, 0, 0, 0, 0, ... 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1/4096, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1/8192, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1/4096, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1/8192, 0, 0, 0]] }}} {{{id=45| pairwise_kmer_dis(chrom_read_list,1) /// }}} {{{id=22| chrom_read_list=[] for obj in sample_list: fname=name_list[sample_pos_dic[obj]] temp=parse_fasta(fname) pos=-1 max_l=-1 for i in range(len(temp)): if max_l<=len(temp[i][1]): pos=i max_l=len(temp[i][1]) chrom_read_list.append(temp[pos][1]) print fname,max_l sample_summary(fname,5) /// 49315__BK 5309360 49320__AM 5433034 49321__JP 5490534 50793__BK 5429493 50847__AM 5433096 50870__AM 5320351 50874__BK 5305405 50876__JP 5461567 50877__AM 5369795 50878__AM 5322273 50879__AM 5431353 50887__BK 5426175 50889__AM 5430491 50899__JP 5426166 50900__BK 4891344 50912__AM 5367784 50917__JP 5427075 50925__BK 5441562 50927__AM 2892181 50929__JP 5438229 50947__BK 5429185 50949__AM 5410436 50960__JP 5423214 50961__BK 5430257 50967__AM 5282757 50978__BK 5356392 50982__JP 5310807 51899__BK 5303493 51902__JP 5298514 51906__AM 4306614 51915__JP 5434844 51930__JP 5430045 56871__JP 5215702 }}} {{{id=30| len(chrom_read_list) /// 33 }}} {{{id=27| fre_kmers(chrom_read_list[0],1) /// {'A': 564023/2654680, 'C': 1527531/5309360, 'G': 762369/2654680, 'T': 225809/1061872} }}} {{{id=26| len(chrom_read_list[0]) /// 5309360 }}} {{{id=23| len(chrom_read_list) /// 33 }}} {{{id=3| temp=parse_fasta('51906__AM') /// }}} {{{id=6| for obj in temp: print obj[0],len(obj[1]) /// >51906__1__AM|IM68|linear 4306614 >51906__2__AM|IM68|linear 711939 >51906__3__AM|IM68|circular 231486 >51906__4__AM|IM68|linear 231219 >51906__5__AM|IM68|linear 96097 >51906__6__AM|IM68|linear 12840 >51906__7__AM|IM68|linear 4075 }}} {{{id=4| sample_summary('51906__AM',5) /// Traceback (most recent call last): File "", line 1, in File "_sage_input_151.py", line 10, in exec compile(u'open("___code___.py","w").write("# -*- coding: utf-8 -*-\\n" + _support_.preparse_worksheet_cell(base64.b64decode("c2FtcGxlX3N1bW1hcnkoJzUxOTA2X19BTScsNSk="),globals())+"\\n"); execfile(os.path.abspath("___code___.py"))' + '\n', '', 'single') File "", line 1, in File "/private/var/folders/jh/vt78zj4d6ms09lxnw69g8jq00000gn/T/tmpS87UeL/___code___.py", line 3, in exec compile(u"sample_summary('51906__AM',_sage_const_5 )" + '\n', '', 'single') File "", line 1, in File "/private/var/folders/jh/vt78zj4d6ms09lxnw69g8jq00000gn/T/tmpcNt5sV/___code___.py", line 8, in sample_summary pairwise_kmer_dis(readlist,k) File "/private/var/folders/jh/vt78zj4d6ms09lxnw69g8jq00000gn/T/tmpnjSpjG/___code___.py", line 13, in pairwise_kmer_dis matrix_plot(matrix(dis_mat), colorbar=True).show() TypeError: 'list' object is not callable }}} {{{id=5| /// }}}