DATA SCIENCE AT THE COMMAND LINE - CHEATSHEET

9 ENVIRONMENTS, HELP, MACROS 16 25 type - display cmndline tool class $ help alias $ help type $ alias ll='ls -alF' $ type bash - Bourne shell sudo - exec cmnd as another user $ sudo apt-get install bash $ sudo apt-get install sudo $ man bash $ man sudo bc - evaluate equation from stdin $ sudo apt-get install bc FILES & DIRECTORIES $ man bc

$ echo 'e(1)' | bc -l body - apply expression to all but 1st line 2.71828182845904523536 $ git clone

$ echo -e "value\n7\n2\n5\n3" | body sort -n cols - apply cmnd to subset of cols, merge result value $ git clone 2 $ < iris.csv cols -C species body tapkee --method pca | 3 header -r x,y,species 5

7 cowsay - debugging helper

$ sudo apt-get install cowsay cd - change working directory $ man cowsay $ help cd $ echo 'The command line is awesome!' | cowsay $ cd ~; ls; cd ..; pwd export - set export attribute for shell variables cat - concat files & stdin, print to stdout $ help export $ sudo apt-get install coreutils $ export WEKAPATH=$HOME/bin $ man cat

$ cat results-01 results-02 results-03 > results-all for - exec command for each member of list

$ help for chmod - change mode bits $ for i in {A..C} "It's easy as" {1..3}; do echo $i; done $ sudo apt-get install coreutils A $ man chmod B $ chmod u+x experiment.sh C

It's easy as cp - copy files & directories 1 $ sudo apt-get install coreutils 2 $ man cp 3

cut - remove sections from each line of files man - read reference manuals of cmndline tools $ sudo apt-get install coreutils $ sudo apt-get install man $ man cut $ man man $ man echo - display line of text

$ sudo apt-get install coreutils pbc - run bc with parallel $ man echo $ git clone

$ seq 5 | pbc '{1}^2' env - run program in modified environment 1 $ sudo apt-get install coreutils 4 DATA SCIENCE AT THE COMMAND LINE - CHEATSHEET

$ man env split - split file into pieces $ #!/usr/bin/env python $ sudo apt-get install coreutils $ man split fieldsplit - split file in multiples based on field value $ # See website for installation instructions tail - output last part of files $ fieldsplit --help $ sudo apt-get install coreutils $ man tail find - file search in directory $ seq 5 | tail -n 3 $ sudo apt-get install findutils 3 $ man find 4 5 head - output first n lines of files $ sudo apt-get install coreutils tee - read from stdin, to stdout and files $ man head $ sudo apt-get install coreutils $ seq 5 | head -n 3 $ man tee 1 2 tr - translate or delete characters 3 $ sudo apt-get install coreutils $ man tr header - add / replace / delete header lines $ git clone wc - newline, word & byte counts for each file $ header -h $ sudo apt-get install coreutils $ man wc less - paginate large files $ echo 'hello world' | wc -c $ sudo apt-get install less 12 $ man less $ csvlook iris.csv | less PATTERN MATCHING ls - list directory contents $ sudo apt-get install coreutils awk - pattern scanning & text processing $ man ls $ sudo apt-get install mawk $ man awk mv - move / rename files & directories $ seq 5 | awk '{sum+=$1} END {print sum}' $ sudo apt-get install coreutils 15 $ man mv sed - filter & transform text paste - merge lines of files $ sudo apt-get install sed $ sudo apt-get install coreutils $ man sed $ man paste grep - print lines matching pattern pwd - print working directory name $ sudo apt-get install grep $ man pwd $ man grep $ pwd /home/vagrant DEPLOYMENT rm - remove files & directories $ sudo apt-get install coreutils aws - manage AWS services $ man rm $ sudo pip install awscli $ aws help sort - sort lines of text files $ aws ec2 describe-regions | head -n 5 $ sudo apt-get install coreutils { "Regions": [ { $ man sort "Endpoint": "ec2.eu-west-1.amazonaws.com", "RegionName": "eu-west-1" DATA SCIENCE AT THE COMMAND LINE - CHEATSHEET

JSON FILES git - manage Git repositories

$ sudo apt-get install git jq - JSON processor $ man git $ man jq

CSV FILES xml2json - XML to JSON $ npm install xml2json-command $ xml2json < input.xml > output.json csvcut - extract columns from CSV

$ sudo pip install csvkit

$ csvcut --help LOGIN, DOWNLOAD, SCRAPE csvgrep - filter CSV where cols=arg or regexp $ sudo pip install csvkit curl - download data from URL $ csvgrep --help $ sudo apt-get install curl

$ man curl csvjoin - merge 2+ CSV tables aka SQL JOIN

$ sudo pip install csvkit curlicue - perform OAuth for curl $ csvjoin --help $ git clone https://github.com/decklin/curlicue.git

csvlook - render CSV to readable stdout scp - copy remote files securely $ sudo pip install csvkit $ sudo apt-get install openssh-client $ csvlook --help $ man scp $ echo -e "a,b\n1,2\n3,4" | csvlook

scrape - scrape HTML with XPath or CSS3 selector csvsort - sort CSV $ git clone $ sudo pip install csvkit $ curl -sL '' | scrape -e 'head > title' $ csvsort --help Data Science Toolbox

csvsql - execute SQL queries on CSV ssh - login to remote machines $ sudo pip install csvkit $ sudo apt-get install ssh $ csvsql --help $ man ssh

csvstack - stack rows from multiple CSVs $ sudo pip install csvkit DISPLAYS $ csvstack --help csvstat - descriptive stats for all cols in CSV display - display image data, any X server $ sudo pip install csvkit $ sudo apt-get install imagemagick $ csvstat --help $ man display in2csv - convert data formats to CSV feedgnuplot - generate gnuplot script $ sudo pip install csvkit $ sudo apt-get install feedgnuplot $ in2csv --help $ man feedgnuplot json2csv - JSON to CSV $ go get github.com/jehiah/json2csv WORKFLOWS $ json2csv --help drake - manage workflow sql2csv - exec cmnds vs SQL DB, return CSV data $ # Please see Chapter 6 for installation instructions. $ sudo pip install csvkit $ drake --help $ sql2csv --help parallel - run shell cmnd lines from stdin in parallel DATA SCIENCE AT THE COMMAND LINE - CHEATSHEET

$ # See website for installation instructions Rio-scatter - scatter plot from CSV using Rio $ man parallel $ git clone $ seq 3 | parallel echo Processing file {}.csv $ < iris.csv Rio-scatter sepal_length sepal_width species Processing file 1.csv > iris.png Processing file 2.csv Processing file 3.csv EXTERNAL TOOL APIS

INTEGER / DATE SEQUENCES bigmler - prediction API

$ sudo pip install bigmler dseq - generate date sequence rel to today $ bigmler --help $ git clone $ dseq -2 0 # day before yesterday till today run_experiment - run ML trial with Scikit-Learn 2014-07-15 $ sudo pip install skll 2014-07-16 $ run_experiment --help 2014-07-17

tapkee - dimensionality reduction API seq - print sequence of numbers $ # See website for installation instructions $ sudo apt-get install coreutils $ tapkee --help $ man seq $ < iris.csv cols -C species body tapkee --method pca | $ seq 3 header -r x,y,species 1

2 weka - Weka API command line tool 3 $ git clone sample - print from stdout (prob, duration, delay) $ git clone FILE EXTRACTION / COMPRESSION $ sample --help tar - create, list, extract TAR archives shuf - generate random permutations $ sudo apt-get install tar $ sudo apt-get install coreutils $ man tar $ man shuf tree - list directory contents, tree format PYTHON, R $ sudo apt-get install tree $ man tree

pip - manage Python packages uniq - report or omit repeated lines $ sudo apt-get install python-pip $ sudo apt-get install coreutils $ man pip $ man uniq

python - exec Python language unpack - extract common file formats $ sudo apt-get install python $ git clone $ man python $ unpack file.tgz

R - exec R language unrar - extract files from RAR archives $ sudo apt-get install r-base-dev $ sudo apt-get install unrar-free $ man R $ man unrar

Rio - load CSV from stdin, run R script, get output unzip - list, test, extract compressed ZIP files $ git clone $ sudo apt-get install unzip $ Rio -h $ man unzip $ seq 10 | Rio -nf sum

55