DATA SCIENCE AT THE COMMAND LINE - CHEATSHEET
9 ENVIRONMENTS, HELP, MACROS 16 25 alias type - display cmndline tool class $ help alias $ help type $ alias ll='ls -alF' $ type cd bash - GNU Bourne-Again shell sudo - exec cmnd as another user $ sudo apt-get install bash $ sudo apt-get install sudo $ man bash $ man sudo bc - evaluate equation from stdin $ sudo apt-get install bc FILES & DIRECTORIES $ man bc
$ echo 'e(1)' | bc -l body - apply expression to all but 1st line 2.71828182845904523536 $ git clone
$ echo -e "value\n7\n2\n5\n3" | body sort -n cols - apply cmnd to subset of cols, merge result value $ git clone
7 cowsay - debugging helper
$ sudo apt-get install cowsay cd - change working directory $ man cowsay $ help cd $ echo 'The command line is awesome!' | cowsay $ cd ~; pwd; cd ..; pwd export - set export attribute for shell variables cat - concat files & stdin, print to stdout $ help export $ sudo apt-get install coreutils $ export WEKAPATH=$HOME/bin $ man cat
$ cat results-01 results-02 results-03 > results-all for - exec command for each member of list
$ help for chmod - change file mode bits $ for i in {A..C} "It's easy as" {1..3}; do echo $i; done $ sudo apt-get install coreutils A $ man chmod B $ chmod u+x experiment.sh C
It's easy as cp - copy files & directories 1 $ sudo apt-get install coreutils 2 $ man cp 3
cut - remove sections from each line of files man - read reference manuals of cmndline tools $ sudo apt-get install coreutils $ sudo apt-get install man $ man cut $ man man $ man grep echo - display line of text
$ sudo apt-get install coreutils pbc - run bc with parallel $ man echo $ git clone
$ seq 5 | pbc '{1}^2' env - run program in modified environment 1 $ sudo apt-get install coreutils 4 DATA SCIENCE AT THE COMMAND LINE - CHEATSHEET
$ man env split - split file into pieces $ #!/usr/bin/env python $ sudo apt-get install coreutils $ man split fieldsplit - split file in multiples based on field value $ # See website for installation instructions tail - output last part of files $ fieldsplit --help $ sudo apt-get install coreutils $ man tail find - file search in directory $ seq 5 | tail -n 3 $ sudo apt-get install findutils 3 $ man find 4 5 head - output first n lines of files $ sudo apt-get install coreutils tee - read from stdin, write to stdout and files $ man head $ sudo apt-get install coreutils $ seq 5 | head -n 3 $ man tee 1 2 tr - translate or delete characters 3 $ sudo apt-get install coreutils $ man tr header - add / replace / delete header lines $ git clone
JSON FILES git - manage Git repositories
$ sudo apt-get install git jq - JSON processor $ man git $ man jq
CSV FILES xml2json - XML to JSON $ npm install xml2json-command $ xml2json < input.xml > output.json csvcut - extract columns from CSV
$ sudo pip install csvkit
$ csvcut --help LOGIN, DOWNLOAD, SCRAPE csvgrep - filter CSV where cols=arg or regexp $ sudo pip install csvkit curl - download data from URL $ csvgrep --help $ sudo apt-get install curl
$ man curl csvjoin - merge 2+ CSV tables aka SQL JOIN
$ sudo pip install csvkit curlicue - perform OAuth for curl $ csvjoin --help $ git clone https://github.com/decklin/curlicue.git
csvlook - render CSV to readable stdout scp - copy remote files securely $ sudo pip install csvkit $ sudo apt-get install openssh-client $ csvlook --help $ man scp $ echo -e "a,b\n1,2\n3,4" | csvlook
scrape - scrape HTML with XPath or CSS3 selector csvsort - sort CSV $ git clone
csvsql - execute SQL queries on CSV ssh - login to remote machines $ sudo pip install csvkit $ sudo apt-get install ssh $ csvsql --help $ man ssh
csvstack - stack rows from multiple CSVs $ sudo pip install csvkit DISPLAYS $ csvstack --help csvstat - descriptive stats for all cols in CSV display - display image data, any X server $ sudo pip install csvkit $ sudo apt-get install imagemagick $ csvstat --help $ man display in2csv - convert data formats to CSV feedgnuplot - generate gnuplot script $ sudo pip install csvkit $ sudo apt-get install feedgnuplot $ in2csv --help $ man feedgnuplot json2csv - JSON to CSV $ go get github.com/jehiah/json2csv WORKFLOWS $ json2csv --help drake - manage workflow sql2csv - exec cmnds vs SQL DB, return CSV data $ # Please see Chapter 6 for installation instructions. $ sudo pip install csvkit $ drake --help $ sql2csv --help parallel - run shell cmnd lines from stdin in parallel DATA SCIENCE AT THE COMMAND LINE - CHEATSHEET
$ # See website for installation instructions Rio-scatter - scatter plot from CSV using Rio $ man parallel $ git clone
INTEGER / DATE SEQUENCES bigmler - prediction API
$ sudo pip install bigmler dseq - generate date sequence rel to today $ bigmler --help $ git clone
tapkee - dimensionality reduction API seq - print sequence of numbers $ # See website for installation instructions $ sudo apt-get install coreutils $ tapkee --help $ man seq $ < iris.csv cols -C species body tapkee --method pca | header -r x,y,species $ seq 3 1
2 weka - Weka API command line tool 3 $ git clone
pip - manage Python packages uniq - report or omit repeated lines $ sudo apt-get install python-pip $ sudo apt-get install coreutils $ man pip $ man uniq
python - exec Python language unpack - extract common file formats $ sudo apt-get install python $ git clone
R - exec R language unrar - extract files from RAR archives $ sudo apt-get install r-base-dev $ sudo apt-get install unrar-free $ man R $ man unrar
Rio - load CSV from stdin, run R script, get output unzip - list, test, extract compressed ZIP files $ git clone
55