# Fetch the RefSeq bacteria assembly summary table from the NCBI FTP server.
$ axel ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt
# Drop the first line, strip the leading "# " from the header line (line 2 of
# the download), and replace every double quote with "$" (presumably so stray
# quotes do not confuse CSV parsing), writing the result as a TSV file.
$ sed -e '1d' -e '2s/^# //' -e 's/"/$/g' assembly_summary.txt \
  > assembly_summary.tsv
# Report the number of columns and rows of the cleaned table (-t: tab-delimited).
$ csvtk stats -t assembly_summary.tsv
file num_cols num_rows
assembly_summary.tsv 22 83,308
# Aligned view: save a column-aligned rendering of the table to a separate file.
$ csvtk -t pretty assembly_summary.tsv > assembly_summary.tsv.pretty
# List the column names together with their 1-based positions.
$ csvtk headers -t assembly_summary.tsv
# assembly_summary.tsv
1 assembly_accession
2 bioproject
3 biosample
4 wgs_master
5 refseq_category
6 taxid
7 species_taxid
8 organism_name
9 infraspecific_name
10 isolate
11 version_status
12 assembly_level
13 release_type
14 genome_rep
15 seq_rel_date
16 asm_name
17 submitter
18 gbrs_paired_asm
19 paired_asm_comp
20 ftp_path
21 excluded_from_refseq
22 relation_to_type_material
# Count the assemblies whose assembly_level is "Complete Genome".
# csvtk grep passes the header row through, so "sed 1d" removes it before
# counting; otherwise wc -l would report one line too many.
$ cat assembly_summary.tsv \
| csvtk grep -t -f assembly_level -i -p "Complete Genome" \
| sed 1d | wc -l
7741
Counting the number of distinct species by species_taxid:
# Number of distinct species_taxid values: build the frequency table, skip
# its header row, and count the remaining lines.
$ csvtk cut -t -f species_taxid assembly_summary.tsv \
  | csvtk freq -t \
  | tail -n +2 \
  | wc -l
12445
Counting by organism_name (imprecise: only the first two words of each name are compared):
# Number of distinct organism names, keeping only the first two
# space-separated words of each name; the frequency table's header row is
# skipped before counting.
$ csvtk cut -t -f organism_name assembly_summary.tsv \
  | cut -d ' ' -f 1-2 \
  | csvtk freq -t \
  | tail -n +2 \
  | wc -l
7086
# Most frequent two-word organism names across all assemblies, sorted by
# descending frequency (-n -r). "head -n 20" keeps the header row plus the
# first 19 data rows.
$ csvtk cut -t -f organism_name assembly_summary.tsv \
  | cut -d ' ' -f 1-2 \
  | csvtk freq -t -n -r \
  | head -n 20 \
  | csvtk pretty -t
organism_name frequency
Staphylococcus aureus 7441
Streptococcus pneumoniae 7257
Salmonella enterica 6903
Escherichia coli 5385
Mycobacterium tuberculosis 5088
Pseudomonas aeruginosa 2230
Acinetobacter baumannii 1947
Klebsiella pneumoniae 1798
Mycobacterium abscessus 1375
Listeria monocytogenes 1324
Shigella sonnei 958
Streptococcus suis 955
Clostridioides difficile 901
Campylobacter jejuni 899
Streptococcus agalactiae 867
Campylobacter coli 802
Neisseria meningitidis 790
Vibrio parahaemolyticus 685
Helicobacter pylori 659
# Same ranking as above, restricted to assemblies whose assembly_level is
# "Complete Genome": header row plus the 19 most frequent two-word names.
$ csvtk grep -t -f assembly_level -i -p "Complete Genome" assembly_summary.tsv \
  | csvtk cut -t -f organism_name \
  | cut -d ' ' -f 1-2 \
  | csvtk freq -t -n -r \
  | head -n 20 \
  | csvtk pretty -t
organism_name frequency
Escherichia coli 306
Bordetella pertussis 291
Salmonella enterica 260
Staphylococcus aureus 145
Campylobacter jejuni 113
Klebsiella pneumoniae 108
Listeria monocytogenes 95
Helicobacter pylori 85
Pseudomonas aeruginosa 80
Neisseria meningitidis 76
Chlamydia trachomatis 68
Legionella pneumophila 62
Acinetobacter baumannii 59
Burkholderia pseudomallei 59
Corynebacterium pseudotuberculosis 59
Mycobacterium tuberculosis 52
Bacillus subtilis 50
Streptococcus pyogenes 50
Bacillus anthracis 43
# Keep rows whose organism_name matches "Mycobacterium tuberculosis"
# (case-insensitive regular expression) and whose assembly_level is
# "Complete Genome"; save the selection to mt.tsv.
$ csvtk grep -t -f organism_name -i -r -p "Mycobacterium tuberculosis" assembly_summary.tsv \
  | csvtk grep -t -f assembly_level -i -p "Complete Genome" \
  > mt.tsv
Given one or a few taxids:
# Keep complete-genome assemblies whose species_taxid is one of the taxids
# given on the command line; save the selection to bytaxid.tsv.
$ csvtk grep -t -f species_taxid -p 239935,1280 assembly_summary.tsv \
  | csvtk grep -t -f assembly_level -i -p "Complete Genome" \
  > bytaxid.tsv
Given a taxid list file (one taxid per line):
# Keep complete-genome assemblies whose species_taxid appears in
# taxid_list.txt (patterns read from the file via -P); save the selection
# to bytaxid.tsv.
$ csvtk grep -t -f species_taxid -P taxid_list.txt assembly_summary.tsv \
  | csvtk grep -t -f assembly_level -i -p "Complete Genome" \
  > bytaxid.tsv
# Download sequence and annotation files for every assembly listed in mt.tsv.
# For each ftp_path, rush sets prefix to "<ftp_path>/<basename of ftp_path>"
# ({} is the input line, {%} its basename) and fetches five files with wget;
# "wget -c" resumes partially downloaded files.
# Every wget must end with ';' -- without it, the words of the following
# "wget -c ..." line become extra arguments of the previous command.
# rush options: -j 10 runs up to 10 jobs concurrently; -c together with
# -C download.rush records successful commands so that a re-run skips them.
$ cat mt.tsv | csvtk cut -t -f ftp_path | sed 1d \
| rush -v prefix='{}/{%}' \
' \
wget -c {prefix}_genomic.fna.gz; \
wget -c {prefix}_genomic.gbff.gz; \
wget -c {prefix}_genomic.gff.gz; \
wget -c {prefix}_cds_from_genomic.fna.gz; \
wget -c {prefix}_protein.faa.gz; \
' \
-j 10 -c -C download.rush