plasmid_seq
system:sage
{{{id=37|
import csv
///
}}}
{{{id=55|
import scipy.stats
///
}}}
{{{id=15|
def count_kmers(read, k):
counts = {}
num_kmers = len(read) - k + 1
for i in range(num_kmers):
kmer = read[i:i+k]
if kmer not in counts:
counts[kmer] = 0
counts[kmer] +=1
return counts
///
}}}
{{{id=16|
def fre_kmers(read,k):
counts=count_kmers(read, k)
temp=0
for key in counts:
temp+=counts[key]
for key in counts:
counts[key]=counts[key]/temp
return counts
///
}}}
{{{id=17|
def kmer_dis(read1,read2,k):
fre1=fre_kmers(read1,k)
fre2=fre_kmers(read2,k)
temp=0
for key in fre1:
if key in fre2:
temp+=abs(fre1[key]-fre2[key])
else:
temp+=fre1[key]
for key in fre2:
if key not in fre1:
temp+=fre2[key]
return temp
///
}}}
{{{id=48|
def reverse_key(kmer_dic):
new_dic={}
for key in kmer_dic:
new_key=key[::-1]
new_dic[new_key]=kmer_dic[key]
return new_dic
///
}}}
{{{id=49|
def complement_key(kmer_dic):
new_dic={}
swap_dic={'A':'T','T':'A','G':'C','C':'G'}
for key in kmer_dic:
new_key=''
for i in range(len(key)):
new_key+=swap_dic[key[i]]
new_dic[new_key]=kmer_dic[key]
return new_dic
///
}}}
{{{id=25|
def dic_L1(dic1,dic2):
temp=0
for key in dic1:
if key in dic2:
temp+=abs(dic1[key]-dic2[key])
else:
temp+=dic1[key]
for key in dic2:
if key not in dic1:
temp+=dic2[key]
return temp
///
}}}
{{{id=66|
def dic_jacc(dic1,dic2):
cap=0
cup=len(dic1)
for key in dic2:
if key in dic1:
cap+=1
else:
cup+=1
return 1-cap/cup
///
}}}
{{{id=50|
def dic_L1_eq(dic1,dic2):
d1=dic_L1(dic1,dic2)
d2=dic_L1(dic1,complement_key(dic2))
d3=dic_L1(dic1,reverse_key(dic2))
d4=dic_L1(dic1,reverse_key(complement_key(dic2)))
return min([d1,d2,d3,d4])
///
}}}
{{{id=68|
def dic_jacc_eq(dic1,dic2):
d1=dic_jacc(dic1,dic2)
d2=dic_jacc(dic1,complement_key(dic2))
d3=dic_jacc(dic1,reverse_key(dic2))
d4=dic_jacc(dic1,reverse_key(complement_key(dic2)))
return min([d1,d2,d3,d4])
///
}}}
{{{id=67|
dic1={'ATG': 1/3, 'GAT': 1/3, 'TGA': 1/3}
dic2={'ATG': 1/4, 'GAT': 1/2, 'TGA': 1/4}
dic3={'AGT': 1/4, 'GTA': 1/4, 'TAG': 1/2}
print dic_jacc_eq(dic1,dic3)
///
0
}}}
{{{id=18|
def pairwise_kmer_dis(read_list,k):
dis_mat=[]
fre_list=[]
l=len(read_list)
for i in range(l):
fre_list.append(fre_kmers(read_list[i],k))
for i in range(l):
temp=[]
for j in range(l):
temp.append(dic_L1_eq(fre_list[i],fre_list[j]))
dis_mat.append(temp)
matrix_plot(matrix(dis_mat), colorbar=True).show()
return dis_mat
///
}}}
{{{id=69|
def pairwise_jacc_dis(read_list,k):
dis_mat=[]
fre_list=[]
l=len(read_list)
for i in range(l):
fre_list.append(fre_kmers(read_list[i],k))
for i in range(l):
temp=[]
for j in range(l):
temp.append(dic_jacc_eq(fre_list[i],fre_list[j]))
dis_mat.append(temp)
matrix_plot(matrix(dis_mat), colorbar=True).show()
return dis_mat
///
}}}
{{{id=1|
def parse_fasta(fname):
file=open('/Users/biocomplexity/Documents/plasmid/ST307/assemblies/'+fname+'.assembly_relabel.fasta','r')
lines=file.readlines()
out=[]
identifier = 0
sequence = ''
for line in lines:
line = line.strip()
if line.startswith(">"):
if identifier==0:
identifier = line
else:
out.append([identifier,sequence])
identifier = line
sequence = ''
else:
sequence+=line
out.append([identifier,sequence])
file.close()
return out
///
}}}
{{{id=33|
def sample_summary(fname,k):
temp=parse_fasta(fname)
readlist=[]
for obj in temp:
readlist.append(obj[1])
pairwise_kmer_dis(readlist,k)
///
}}}
{{{id=41|
def get_contig(inc_type):
output=[]
pos=inc_type_dic[inc_type]
for i in range(len(indicator)):
if indicator[i][pos]==1:
output.append(contig_list[i])
return output
///
}}}
{{{id=42|
def get_read(contig):
fname=contig[:5]+contig[8:12]
read_list=parse_fasta(fname)
temp=0
for i in range(len(read_list)):
if contig==read_list[i][0][1:]:
temp=read_list[i][1]
return temp
///
}}}
{{{id=7|
name_list=['49315__BK','49320__AM','49320__JP','49321__JP','50793__BK','50847__AM','50870__AM','50870__BK','50870__JP','50874__BK','50874__JP','50876__JP','50877__AM','50877__BK','50878__AM','50879__AM','50879__BK','50879__JP','50887__BK','50889__AM','50899__JP','50900__BK','50912__AM','50912__JP','50917__JP','50925__BK','50927__AM','50929__JP','50947__BK','50949__AM','50960__JP','50961__BK','50967__AM','50978__BK','50982__JP','51899__BK','51902__JP','51906__AM','51915__JP','51930__JP','56871__JP']
///
}}}
{{{id=31|
print sample_list
///
['49315', '49320', '49321', '50793', '50847', '50870', '50874', '50876', '50877', '50878', '50879', '50887', '50889', '50899', '50900', '50912', '50917', '50925', '50927', '50929', '50947', '50949', '50960', '50961', '50967', '50978', '50982', '51899', '51902', '51906', '51915', '51930', '56871']
}}}
{{{id=20|
sample_list=[]
for name in name_list:
if name[:5] not in sample_list:
sample_list.append(name[:5])
///
}}}
{{{id=9|
sample_nd_dic={}
for name in name_list:
if name[:5] not in sample_nd_dic:
sample_nd_dic[name[:5]]=0
sample_nd_dic[name[:5]]+=1
///
}}}
{{{id=11|
sample_pos_dic={}
l=len(name_list)
temp='xxxxxx'
for i in range(l):
if name_list[i][:5]!=temp:
sample_pos_dic[name_list[i][:5]]=i
temp=name_list[i][:5]
///
}}}
{{{id=34|
name_nd_list=[]
for key in sample_pos_dic:
name_nd_list.append(name_list[sample_pos_dic[key]])
///
}}}
{{{id=36|
file=open('/Users/biocomplexity/Documents/plasmid/ST307/ST307.plasmidfinder_results.tsv','r')
reader=csv.reader(file, delimiter='\t')
matr=[]
for row in reader:
temp=[]
for i in range(len(row)):
temp.append(row[i])
matr.append(temp)
file.close()
///
}}}
{{{id=38|
inc_type_list=[]
for i in range(184):
if matr[i+1][2] not in inc_type_list:
inc_type_list.append(matr[i+1][2])
inc_type_dic={}
for i in range(18):
inc_type_dic[inc_type_list[i]]=i
///
}}}
{{{id=39|
contig_list=[]
for i in range(184):
if matr[i+1][5] not in contig_list:
contig_list.append(matr[i+1][5])
contig_dic={}
for i in range(110):
contig_dic[contig_list[i]]=i
///
}}}
{{{id=40|
indicator=[]
for i in range(110):
temp=[]
for j in range(18):
temp.append(0)
indicator.append(temp)
for row in matr[1:]:
indicator[contig_dic[row[5]]][inc_type_dic[row[2]]]=1
///
}}}
{{{id=57|
sum([1,2,3])
///
6
}}}
{{{id=56|
plasmid_list=[]
for i in range(110):
if sum(indicator[i])!=0:
plasmid_list.append(contig_list[i])
///
}}}
{{{id=58|
len(plasmid_list)
///
110
}}}
{{{id=52|
len(incX3[0][1:])
///
25
}}}
{{{id=53|
len(fibkX)
///
38
}}}
{{{id=63|
rank_X3 = 0
for obj in plasmid_total[0][1:]:
if obj>0.46739:
rank_X3+=1
///
}}}
{{{id=64|
rank_X3
///
27
}}}
{{{id=62|
print float(mean(fibkX[0][1:])),float(std(fibkX[0][1:]))
///
0.270300131396 0.0186597741643
}}}
{{{id=61|
print float(median(incX3[0][1:])),float(std(incX3[0][1:]))
///
0.467382436024 0.0442171451418
}}}
{{{id=60|
print float(median(plasmid_total[0][1:])),float(std(plasmid_total[0][1:]))
///
0.41113922943 inf
}}}
{{{id=54|
scipy.stats.mannwhitneyu(incX3[0][1:], plasmid_total[0][1:],alternative='two-sided')
///
MannwhitneyuResult(statistic=1823.5, pvalue=0.011038213170522658)
}}}
{{{id=59|
reads=[]
reads.append(chrom_read_list[0])
for contig in plasmid_list:
print contig
reads.append(get_read(contig))
for read in reads:
print len(read)
plasmid_total=pairwise_kmer_dis(reads,5)
///
WARNING: Output truncated!
full_output.txt
49315__5__BK|78|circular
49315__4__BK|78|circular
49315__2__BK|78|circular
49315__3__BK|78|circular
49320__3__AM|I8|circular
49320__2__AM|I8|circular
49320__3__JP|I8|circular
49320__2__JP|I8|circular
49321__3__JP|I13|circular
49321__2__JP|I13|circular
50793__4__BK|KO1|circular
50793__3__BK|KO1|circular
50793__2__BK|KO1|circular
50847__3__AM|KO90|circular
50847__2__AM|KO90|circular
50870__5__AM|107_ESBL|circular
50870__4__AM|107_ESBL|circular
50870__2__AM|107_ESBL|circular
50870__3__AM|107_ESBL|circular
50870__6__BK|107_ESBL|circular
50870__4__BK|107_ESBL|circular
50870__2__BK|107_ESBL|circular
50870__3__BK|107_ESBL|circular
50870__8__JP|107_ESBL|circular
50870__4__JP|107_ESBL|circular
50870__2__JP|107_ESBL|circular
50870__3__JP|107_ESBL|linear
50874__4__BK|38_ESBL|circular
50874__3__BK|38_ESBL|circular
50874__2__BK|38_ESBL|circular
50874__4__JP|38_ESBL|circular
50874__3__JP|38_ESBL|circular
50874__2__JP|38_ESBL|circular
50876__5__JP||circular
50876__3__JP||circular
50876__4__JP||circular
50876__2__JP||linear
50877__4__AM|A05|circular
50877__2__AM|A05|circular
50877__5__AM|A05|circular
50877__4__BK|A05|circular
50877__2__BK|A05|circular
50877__5__BK|A05|circular
50878__6__AM|A21|circular
50878__4__AM|A21|circular
50878__2__AM|A21|linear
50879__3__AM|C6|circular
50879__2__AM|C6|linear
50879__3__BK|C6|circular
50879__2__BK|C6|linear
50879__2__JP|C6|linear
50887__6__BK|E23|circular
50887__3__BK|E23|circular
50887__2__BK|E23|circular
50889__3__AM|E30|circular
50889__2__AM|E30|circular
50899__3__JP|F20|circular
50899__2__JP|F20|circular
50900__3__BK|F25|linear
...
337117
2859
51479
193137
51479
192922
51479
192922
207780
51479
51479
185398
51479
185656
51479
193137
51479
192922
150996
51479
194831
51479
195566
150995
3549
51479
193007
51479
188774
3549
51479
192792
3549
51479
265406
3549
51479
192792
1551
9293
89609
191494
1551
9293
89609
191470
1551
9293
243050
1551
9293
248812
96097
231486
51484
195438
51479
191087
170141
435025
}}}
{{{id=51|
reads=[]
reads.append(chrom_read_list[0])
for contig in get_contig('IncX3'):
print contig
reads.append(get_read(contig))
for read in reads:
print len(read)
incX3=pairwise_kmer_dis(reads,5)
///
49320__3__AM|I8|circular
49320__3__JP|I8|circular
49321__3__JP|I13|circular
50793__3__BK|KO1|circular
50847__3__AM|KO90|circular
50879__3__AM|C6|circular
50879__3__BK|C6|circular
50879__2__JP|C6|linear
50887__3__BK|E23|circular
50889__3__AM|E30|circular
50899__3__JP|F20|circular
50900__4__BK|F25|circular
50912__3__AM|G27|circular
50912__3__JP|G27|circular
50917__3__JP|G58|circular
50925__5__BK|H77|circular
50927__5__AM|L13|circular
50929__5__JP|L28|circular
50947__3__BK|I2|circular
50949__3__AM|I15|circular
50960__3__JP|K22|circular
50961__3__BK|K29|circular
50967__4__AM|K61|circular
51915__2__JP|NH25|linear
51930__5__JP|G13|circular
5309360
51307
51307
51477
51479
51479
95747
95747
337117
51479
51479
51479
51479
51479
51479
51479
51479
51479
51479
51479
51479
51479
51479
51479
51484
51479
}}}
{{{id=43|
reads=[]
reads.append(chrom_read_list[0])
for contig in get_contig('IncFIB(K)'):
print contig
reads.append(get_read(contig))
for read in reads:
print len(read)
fibkX=pairwise_kmer_dis(reads,5)
///
49315__2__BK|78|circular
49320__2__AM|I8|circular
49320__2__JP|I8|circular
49321__2__JP|I13|circular
50793__2__BK|KO1|circular
50847__2__AM|KO90|circular
50870__2__AM|107_ESBL|circular
50870__2__BK|107_ESBL|circular
50870__2__JP|107_ESBL|circular
50874__2__BK|38_ESBL|circular
50874__2__JP|38_ESBL|circular
50876__2__JP||linear
50877__2__AM|A05|circular
50877__2__BK|A05|circular
50878__2__AM|A21|linear
50887__2__BK|E23|circular
50889__2__AM|E30|circular
50899__2__JP|F20|circular
50900__3__BK|F25|linear
50912__2__AM|G27|circular
50912__2__JP|G27|circular
50917__2__JP|G58|circular
50925__2__BK|H77|circular
50927__3__AM|L13|circular
50929__2__JP|L28|circular
50947__2__BK|I2|circular
50949__2__AM|I15|circular
50960__2__JP|K22|circular
50967__2__AM|K61|circular
50978__2__BK||circular
50982__2__JP|KV5|circular
51899__2__BK|IM18|circular
51902__2__JP|IM43|linear
51906__3__AM|IM68|circular
51915__3__JP|NH25|linear
51930__2__JP|G13|circular
56871__2__JP||circular
5309360
242625
206072
193137
184067
192792
192922
171075
171333
170776
243741
243692
279752
234446
221646
176630
193137
192922
192922
207780
185398
185656
193137
192922
194831
195566
193007
188774
192792
192792
191494
191470
243050
248812
231486
195438
191087
435025
}}}
{{{id=65|
for inc_name in inc_type_list:
print inc_name
reads=[]
reads.append(chrom_read_list[0])
for contig in get_contig(inc_name):
reads.append(get_read(contig))
print len(reads)-1
testabc=pairwise_kmer_dis(reads,5)
print float(median(testabc[0][1:]))
///
Col440I
7
0.612474055765
IncFIA(HI1)
5
0.317374731223
IncFIB(K)
37
0.266010412234
IncFIB(Mar)
8
0.495657507327
IncFII(K)
36
0.266002975077
IncR
5
0.317374731223
ColKP3
25
0.467304728399
IncX3
25
0.467382436024
Col(MG828)
10
0.776165880531
ColRNAI
11
0.41135453786
IncA/C2
1
0.397385179161
IncFII
4
0.267132662441
IncL/M(pOXA-48)
2
0.320168652211
IncFIB(pQil)
3
0.267090403177
IncHI1B
1
0.454951245412
FII(pBK30683)
2
0.239491605923
IncHI2
1
0.365804254081
IncHI2A
1
0.365804254081
}}}
{{{id=47|
reads=[]
reads.append(chrom_read_list[0])
for contig in get_contig('IncFII'):
print contig
reads.append(get_read(contig))
for read in reads:
print len(read)
FII=pairwise_kmer_dis(reads,5)
///
50876__2__JP||linear
50925__3__BK|H77|circular
50929__3__JP|L28|circular
51930__3__JP|G13|circular
5309360
279752
150996
150995
170141
}}}
{{{id=70|
pairwise_jacc_dis(chrom_read_list,7)
///
WARNING: Output truncated!
full_output.txt
[[0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1/4096,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1/8192,
0,
0,
0],
[0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1/4096,
0,
0,
0,
0,
0,
0,
0,
...
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1/4096,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1/8192,
0,
0,
0],
[0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1/4096,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1/8192,
0,
0,
0]]
}}}
{{{id=45|
pairwise_kmer_dis(chrom_read_list,1)
///
}}}
{{{id=22|
chrom_read_list=[]
for obj in sample_list:
fname=name_list[sample_pos_dic[obj]]
temp=parse_fasta(fname)
pos=-1
max_l=-1
for i in range(len(temp)):
if max_l<=len(temp[i][1]):
pos=i
max_l=len(temp[i][1])
chrom_read_list.append(temp[pos][1])
print fname,max_l
sample_summary(fname,5)
///
49315__BK 5309360
49320__AM 5433034
49321__JP 5490534
50793__BK 5429493
50847__AM 5433096
50870__AM 5320351
50874__BK 5305405
50876__JP 5461567
50877__AM 5369795
50878__AM 5322273
50879__AM 5431353
50887__BK 5426175
50889__AM 5430491
50899__JP 5426166
50900__BK 4891344
50912__AM 5367784
50917__JP 5427075
50925__BK 5441562
50927__AM 2892181
50929__JP 5438229
50947__BK 5429185
50949__AM 5410436
50960__JP 5423214
50961__BK 5430257
50967__AM 5282757
50978__BK 5356392
50982__JP 5310807
51899__BK 5303493
51902__JP 5298514
51906__AM 4306614
51915__JP 5434844
51930__JP 5430045
56871__JP 5215702
}}}
{{{id=30|
len(chrom_read_list)
///
33
}}}
{{{id=27|
fre_kmers(chrom_read_list[0],1)
///
{'A': 564023/2654680,
'C': 1527531/5309360,
'G': 762369/2654680,
'T': 225809/1061872}
}}}
{{{id=26|
len(chrom_read_list[0])
///
5309360
}}}
{{{id=23|
len(chrom_read_list)
///
33
}}}
{{{id=3|
temp=parse_fasta('51906__AM')
///
}}}
{{{id=6|
for obj in temp:
print obj[0],len(obj[1])
///
>51906__1__AM|IM68|linear 4306614
>51906__2__AM|IM68|linear 711939
>51906__3__AM|IM68|circular 231486
>51906__4__AM|IM68|linear 231219
>51906__5__AM|IM68|linear 96097
>51906__6__AM|IM68|linear 12840
>51906__7__AM|IM68|linear 4075
}}}
{{{id=4|
sample_summary('51906__AM',5)
///
Traceback (most recent call last):
File "", line 1, in
File "_sage_input_151.py", line 10, in
exec compile(u'open("___code___.py","w").write("# -*- coding: utf-8 -*-\\n" + _support_.preparse_worksheet_cell(base64.b64decode("c2FtcGxlX3N1bW1hcnkoJzUxOTA2X19BTScsNSk="),globals())+"\\n"); execfile(os.path.abspath("___code___.py"))' + '\n', '', 'single')
File "", line 1, in
File "/private/var/folders/jh/vt78zj4d6ms09lxnw69g8jq00000gn/T/tmpS87UeL/___code___.py", line 3, in
exec compile(u"sample_summary('51906__AM',_sage_const_5 )" + '\n', '', 'single')
File "", line 1, in
File "/private/var/folders/jh/vt78zj4d6ms09lxnw69g8jq00000gn/T/tmpcNt5sV/___code___.py", line 8, in sample_summary
pairwise_kmer_dis(readlist,k)
File "/private/var/folders/jh/vt78zj4d6ms09lxnw69g8jq00000gn/T/tmpnjSpjG/___code___.py", line 13, in pairwise_kmer_dis
matrix_plot(matrix(dis_mat), colorbar=True).show()
TypeError: 'list' object is not callable
}}}
{{{id=5|
///
}}}