-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_main.py
218 lines (197 loc) · 6.8 KB
/
run_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
chrom = callset['variants/CHROM']
# get specific chromosomes
chrom[np.where(chrom[:] == "chr")]
chrom[:] # loads into numpy array from hf5 object
pos = callset['variants/POS']
# fast with Dask
gt = allel.GenotypeDaskArray(callset['calldata/GT'])
ac = gt.count_alleles(max_allele=2).compute
# gt = allel.GenotypeArray(callset['calldata/GT'])
# gt[variants, samples], gt[1:3, :] 2-4th variant all samples
# gt[:, 0:2], all variants, 1st and 2nd samples
gt.compress(mask)
gt.take(actual_indexes)
gt.subset()
@author: scott
"""
from __future__ import division
from __future__ import print_function
from imp import reload
import allel
import argparse
import numpy as np
import pandas as pd
import pyfasta
# functions
import astat
from allel_class import Chr
import apca as apca
import autil as autil
import afst as afst
import adxy as adxy
import adiff as ad
from af234 import pF2
import asfs as asfs
import adiv as av
import ald as ald
parser = argparse.ArgumentParser()
parser.add_argument('-fa', '--fasta', help='path to fasta genome')
parser.add_argument('-gff3', '--ggf3feature', help="path to gff3")
parser.add_arguement('-v', "--vcf", help="path to vcf")
parser.add_argument('--h5', action="store_true", help="h5 exists")
parser.add_arguement('-m', "--meta", required=True, help="path to meta data")
parser.add_arguement('-s', "--samples", type=int, help="number samples")
parser.add_arguement('-a', "--altnum", type=int, default=1,
help="number alt alleles")
args = parser.parse_args()
def makeh5fromvcf(vcfin, altnum, hf5):
"""
"""
h5out = "{}.h5".format(vcfin.split(".")[:-2])
if hf5:
pass
else:
fieldsfromvcf = ['samples', 'calldata/GQ', 'variants/ALT',
'variants/REF', 'variants/QUAL', 'variants/CHROM',
'variants/POS', 'variants/AF', 'variants/AB',
'variants/MQM', 'variants/DP', 'calldata/DP',
'calldata/AD', 'calldata/GT']
allel.vcf_to_hdf5(vcfin, h5out, fields=fieldsfromvcf,
types={'calldata/GQ': 'float32'}, alt_number=2)
# callset = h5py.File(h5out, mode='r')
return(None)
def loadgenome_extradata_fx(fasta_handle, gff3_handle, meta):
"""
"""
genome = pyfasta.Fasta(fasta_handle, key_fn=lambda key: key.split()[0])
gff3 = allel.FeatureTable.from_gff3(gff3_handle)
meta = pd.read_csv(meta, delimiter=",")
return(genome, gff3, meta)
if __name__ == '__main__':
# load data args
fasta_handle = args.fasta
gff3_handle = args.gff3
meta = args.meta
vcfin = args.vcf
samp = args.samples
alleles = args.altnum
# start funcs
makeh5fromvcf(vcfin, alleles, args.h5)
genome, gff3, meta = loadgenome_extradata_fx(fasta_handle, gff3_handle,
meta)
meta = "AnopSG.55.info"
meta = pd.read_csv(meta, delimiter=",")
var = Chr('All', '2L.FSG.SNP.recode.h5')
popdict = autil.subpops(var, meta, bypop=True, bykary=False)
pop2color = autil.popcols(popdict)
chrlist = np.unique(var.chrm[:])
chrlen = {}
with open("chr_info", 'r') as c:
for line in c:
x = line.strip().split()
chrlen[x[0]] = int(x[1])
# number of samples and list of chromosomes
n_samples = len(var.calls['samples'])
print("number of samples: {}".format(n_samples))
print("list of loaded chromosomes: {}".format(chrlist))
# Sample GT stats
miss = astat.gtstats(var.calls, pop2color, var.chrm.shape[0])
# Chr stats
astat.chrstats(chrlist, var.calls, miss, n_samples)
# Population Stats
chrstatdict, diff, priv = astat.popstats(chrlist, meta, popdict, var)
# PCA and LD thin
thinpos = {}
pcadict = {}
gnudict = {}
for c in chrlist:
# LD thin
print(c)
var.geno(c, meta)
var.miss(var.gt, var.pos, 0) # use only sites without missing data
gn, thinp = autil.ldthin(var.gt, var.pos, "thin", iters=5)
gnudict[c] = gn
thinpos[c] = thinp
for c in gnudict.keys():
# PCA
gn = gnudict[c]
coords, model = apca.pca_fx(gn, meta, c, pop2color, False, var.pop,
bykary=True)
pcadict[c] = (coords, model)
# ADMIXTURE input files
autil.vcf2plink(thinpos) # make input files
# TODO: cut partial windows
# FST, Fstatistics, doubletons, jsfs
fstdict = {}
dxydict = {}
f2dict = {}
doubdict = {}
jsfsdict = {}
for c in chrlist:
var.geno(c, meta)
# var.miss(var.gt, var.pos, .20)
# var.mac(var.gt, var.pos, 1)
print("\nStats for Chromosome {}\n".format(c))
# allele count object
ac_subpops = var.gt.count_alleles_subpops(popdict, max_allele=2)
# TODO: error with Wb
# df_FST = afst.pairFST(c, chrlen[c], ac_subpops, var, popdict,
# plot=True)
#fstdict[c] = df_FST
df_dxy = adxy.pairDxy(c, chrlen[c], ac_subpops, var.pos, plot=True)
dxydict[c] = df_dxy
# jsfsdict = ad.jsfs(ac_subpops, save=False)
# dshare = ad.shared_doubletons(ac_subpops)
# doubdict[c] = dshare
# TODO: fix f2
# f2 = pF2(ac_subpops)
# f2dict[c] = f2
# Diversity statistics
sfsdict = {}
pidict = {}
tajddict = {}
thetadict = {}
for c in chrlist:
var.geno(c, meta)
print("\nStats for Chromosome {}\n".format(c))
# var.miss(var.gt, var.pos, .20)
# var.mac(var.gt, var.pos, 1)
# allele count object
ac_subpops = var.gt.count_alleles_subpops(popdict, max_allele=1)
#sfs = asfs.sfs_plot(c, ac_subpops)
#sfsdict[c] = sfs
pi = av.pi(c, chrlen[c], ac_subpops, var.pos, plot=True)
pidict[c] = pi
d = av.tajd(c, chrlen[c], ac_subpops, var.pos, plot=True)
tajddict[c] = d
t = av.theta(c, chrlen[c], ac_subpops, var.pos, plot=True)
thetadict[c] = t
# diversity box plot: sumdict() autil then boxplot in aplot
# LD decay plot
lddict = {}
for c in chrlist:
var.geno(c, meta)
print("\nStats for Chromosome {}\n".format(c))
var.miss(var.gt, var.pos, .20)
# allele count object
ac_subpops = var.gt.count_alleles_subpops(popdict, max_allele=1)
lddict[c] = ald.ld_decay(c, chrlen[c], ac_subpops, popdict,
pop2color, var)
import matplotlib.pyplot as plt
# tajd histogram
x = []
for p in tajddict.keys():
x.append((tajddict[p]["Fun"][2][0]))
m = np.concatenate(x).ravel()
n = m[~np.isnan(m)]
b,bins,patches = plt.hist(n, 50, density=True)
# pi histogram
x = []
for p in pidict.keys():
x.append((pidict[p]["Par"][2][0]))
m = np.concatenate(x).ravel()
n = m[~np.isnan(m)]
b, bins, patches = plt.hist(n, 50, density=True)