-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimu_RGC_microbiome.py
executable file
·68 lines (49 loc) · 2 KB
/
simu_RGC_microbiome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
# require wgsim
import os
import numpy as np
import random
def get_abundance(num):
# Set the parameters of the log-normal distribution
mu = 0.0 # Mean of the logarithm of the distribution
sigma = 1.0 # Standard deviation of the logarithm of the distribution
# Generate random numbers from the log-normal distribution
abundance = np.random.lognormal(mean=mu, sigma=sigma, size=1000)
new_abun = abundance[:num]
abundance = np.array(new_abun)/sum(new_abun)
# print (abundance)
return abundance
def get_sample(ID, bin_num):
abundance = get_abundance(bin_num)
random.shuffle(bins_list)
select_bins = bins_list[:bin_num]
sample_dir = outdir + "/" + ID
os.system("mkdir %s"%(sample_dir))
truth_file = sample_dir + "/" + ID + ".truth.csv"
f = open(truth_file, 'w')
for i in range(bin_num):
the_bin = select_bins[i]
bin_name = the_bin.split("/")[-1]
abun = abundance[i]
read_pair_num = round(read_num * abun)
print (bin_name, read_pair_num, abun, file = f)
os.system(f"wgsim -S 6 -e 0 -1 150 -2 150 -r 0 -N {read_pair_num} {the_bin} {sample_dir}/{ID}_tem{i}.read1.fq {sample_dir}/{ID}_tem{i}.read2.fq")
os.system(f"cat {sample_dir}/{ID}_tem*.read1.fq > {sample_dir}/{ID}.read1.fq")
os.system(f"cat {sample_dir}/{ID}_tem*.read2.fq > {sample_dir}/{ID}.read2.fq")
# os.system(f"rm {sample_dir}/{ID}_tem*.read*.fq")
os.system(f"gzip -f {sample_dir}/{ID}.read*.fq")
if __name__ == "__main__":
## hyper parameters
base_num = 10000000000 ## 10G base
# base_num = 100000 ## 10G base
bin_num = 10
sample_num = 50
outdir = "data/"
bin_file_list = "bins.list" # each line is a path of a bin's fasta file
read_num = round(base_num/(150*2)) ###150bp pair
bins_list = []
for line in open(bin_file_list):
bins_list.append(line.strip())
for i in range(sample_num):
ID = f"rgc_meta_{i}"
get_sample(ID, bin_num)