# Download the zoneCluster helper script into the home directory.
# -f makes curl fail on an HTTP error instead of saving the error page as the script.
curl -fL https://raw.githubusercontent.com/luvres/hadoop/master/zoneCluster.sh -o ~/zoneCluster.sh
# Shortcut for running the script. Bash expands aliases only in interactive
# shells by default, so this is meant to be typed at a prompt.
alias zoneCluster="bash ~/zoneCluster.sh"
# -p: do not fail when the notebooks directory already exists.
mkdir -p "$HOME/notebooks"
# Run the script without arguments.
zoneCluster
http://localhost:8088
http://localhost:50070
http://localhost:60010
http://localhost:8888/terminals/1
sh-4.2# bash <enter>
# Run the script with the argument 3 (presumably the number of nodes — confirm in zoneCluster.sh).
zoneCluster 3
# Follow (-f) the log output of the container named Hadoop.
docker logs -f Hadoop
Note: The script is limited to a maximum of 9 nodes because multiple hosts are being created on a single physical host, and I see no point in overloading your machine. The settings are ready for a real cluster, and in the future I want to create scripts for provisioning with Docker Swarm.
# Presumably shuts the cluster down — confirm in zoneCluster.sh.
zoneCluster Stop
# Run again with 2 as the first argument plus the -db flag
# (presumably also starts the database containers used by the Sqoop examples — TODO confirm).
zoneCluster 2 -db
http://localhost:8888/terminals/1
# bash <Enter>
# Import the table `user` from the `mysql` database on host mariadb, using one map task (-m 1).
# NOTE(review): --password puts the password on the command line; acceptable for a demo only.
sqoop import \
--connect jdbc:mysql://mariadb:3306/mysql \
--username root \
--password maria \
--table user -m 1
# Recursively list the HDFS path `user`; it is relative, so it resolves under the
# current user's HDFS home directory.
hdfs dfs -ls -R user
# Open an interactive shell in the OracleXE container; the commands below are
# presumably meant to be typed inside that shell.
docker exec -ti OracleXE bash
cd "$HOME/data/"
# Download and unpack the MovieLens 20M archive.
curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip
unzip ml-20m.zip
cd ml-20m
# Keep only the last 1% of ratings.csv: the line count divided by 100 (integer
# division) is the number of trailing lines written to ml_ratings.csv.
tail -n "$(( $(wc -l < ratings.csv) / 100 ))" ratings.csv > ml_ratings.csv
# Open SQL*Plus as SYSDBA; the lines prefixed with "SQL>" are typed at its prompt.
sqlplus sys/oracle as sysdba
-- Create the demo user and give it the privileges needed to create and fill tables.
SQL> create user aluno identified by dsacademy;
SQL> grant connect, resource, unlimited tablespace to aluno;
-- Reconnect as the new user. CONNECT expects username/password@connect_identifier.
SQL> conn aluno/dsacademy@xe
-- Confirm which user the session is now running as.
SQL> select user from dual;
-- Target table for the ratings sample.
-- NOTE(review): DECIMAL(30) has scale 0, so fractional ratings would be rounded
-- on load — confirm this is intended.
SQL> CREATE TABLE cinema (
ID NUMBER PRIMARY KEY,
USER_ID VARCHAR2(30),
MOVIE_ID VARCHAR2(30),
RATING DECIMAL(30),
TIMESTAMP VARCHAR2(256)
);
SQL> desc cinema;
SQL> quit
# Write the SQL*Loader control file. The here-document delimiter is unquoted, so
# $HOME inside the body is expanded by the shell before the file is written;
# tee also echoes the content to the terminal.
tee $HOME/data/loader.dat <<EOF
load data
INFILE '$HOME/data/ml-20m/ml_ratings.csv'
INTO TABLE cinema
APPEND
FIELDS TERMINATED BY ','
trailing nullcols
(id SEQUENCE (MAX,1),
user_id CHAR(30),
movie_id CHAR(30),
rating decimal external,
timestamp char(256))
EOF
# Run SQL*Loader as user aluno with the control file written above; the log goes to loader.log.
sqlldr userid=aluno/dsacademy control=$HOME/data/loader.dat log=$HOME/data/loader.log
# Open SQL*Plus as aluno and count the rows in cinema.
sqlplus aluno/dsacademy
SQL> select count(*) from cinema;
http://localhost:8888/terminals/1
# bash <Enter>
# Import the result of a free-form query from Oracle into HDFS at /user/oracle/output,
# using one map task (-m 1).
# \$CONDITIONS is escaped so that the shell passes the literal text $CONDITIONS
# through to Sqoop inside the double-quoted query.
sqoop import \
--connect jdbc:oracle:thin:@oraclexe:1521:XE \
--username aluno \
--password dsacademy \
--query "select user_id, movie_id from cinema where rating = 1 and \$CONDITIONS" \
--target-dir /user/oracle/output -m 1
# Download the sample CSV and copy it into a new /hive directory in HDFS.
curl -O https://raw.githubusercontent.com/luvres/hadoop/master/datasets/empregados.csv
hdfs dfs -mkdir /hive
hdfs dfs -copyFromLocal empregados.csv /hive
# Initialise the Hive metastore schema with the Derby database type.
schematool -initSchema -dbType derby
# Remove the local metastore_db directory. Options are placed before the operand
# so the command also works with rm implementations that stop option parsing at
# the first operand.
# NOTE(review): this runs right after the schema initialisation — confirm the intended order.
rm -rf metastore_db
# Start the Hive CLI; the HiveQL statements that follow are typed there.
hive
-- Staging table: each line of the CSV is held as a single string column.
CREATE TABLE temp_colab (texto String);
-- Load the file from HDFS into the staging table, replacing any existing contents.
LOAD DATA INPATH '/hive/empregados.csv' OVERWRITE INTO TABLE temp_colab;
SELECT * FROM temp_colab;
-- Typed target table.
CREATE TABLE IF NOT EXISTS colaboradores(
id int,
nome String,
cargo String,
salario double,
cidade String
);
-- Split each raw line on commas: the non-capturing group is repeated {N} times,
-- so capture group 1 ends up holding the N-th comma-separated field.
insert overwrite table colaboradores
SELECT
regexp_extract(texto, '^(?:([^,]*),?){1}', 1) ID,
regexp_extract(texto, '^(?:([^,]*),?){2}', 1) nome,
regexp_extract(texto, '^(?:([^,]*),?){3}', 1) cargo,
regexp_extract(texto, '^(?:([^,]*),?){4}', 1) salario,
regexp_extract(texto, '^(?:([^,]*),?){5}', 1) cidade
from temp_colab;
SELECT * FROM colaboradores;
-- Look up a single row by id.
SELECT * FROM colaboradores WHERE Id = 3002;
-- Total salary per city.
SELECT sum(salario), cidade from colaboradores group by cidade;
# Create the HDFS input directories for the two classes (ham and spam).
hdfs dfs -mkdir -p /mahout/input/{ham,spam}
# Download and unpack both datasets into the current directory, then copy them to HDFS.
curl https://raw.githubusercontent.com/luvres/hadoop/master/datasets/ham.tar.gz | tar -xzf -
curl https://raw.githubusercontent.com/luvres/hadoop/master/datasets/spam.tar.gz | tar -xzf -
hdfs dfs -copyFromLocal ham/* /mahout/input/ham
hdfs dfs -copyFromLocal spam/* /mahout/input/spam
# Convert the input directory with seqdirectory, then vectorise it with seq2sparse.
mahout seqdirectory -i /mahout/input -o /mahout/output/seqoutput
mahout seq2sparse -i /mahout/output/seqoutput -o /mahout/output/sparseoutput
hdfs dfs -ls /mahout/output/sparseoutput
# Split the TF-IDF vectors into training and test sets (30% random selection).
# The execution method is spelled "sequential" (the original had "sequencial").
mahout split -i /mahout/output/sparseoutput/tfidf-vectors --trainingOutput /mahout/nbTrain --testOutput /mahout/nbTest --randomSelectionPct 30 --overwrite --sequenceFiles -xm sequential
# Train the naive Bayes model, then evaluate it on the test set.
mahout trainnb -i /mahout/nbTrain -li /mahout/nbLabels -o /mahout/nbmodel -ow -c
mahout testnb -i /mahout/nbTest -m /mahout/nbmodel -l /mahout/nbLabels -ow -o /mahout/nbpredictions -c
# Create the HDFS directory for the clustering input and copy the news dataset into it.
hdfs dfs -mkdir -p /mahout/clustering/data
curl https://raw.githubusercontent.com/luvres/hadoop/master/datasets/news.tar.gz | tar -xzf -
hdfs dfs -copyFromLocal news/* /mahout/clustering/data
# Convert the input with seqdirectory, then vectorise it with seq2sparse.
mahout seqdirectory -i /mahout/clustering/data -o /mahout/clustering/kmeansseq
mahout seq2sparse -i /mahout/clustering/kmeansseq -o /mahout/clustering/kmeanssparse
hdfs dfs -ls /mahout/clustering/kmeanssparse
# Run k-means on the TF-IDF vectors with k=3, at most 10 iterations, and cosine distance.
mahout kmeans -i /mahout/clustering/kmeanssparse/tfidf-vectors/ -c /mahout/clustering/kmeanscentroids -cl -o /mahout/clustering/kmeansclusters -k 3 -ow -x 10 -dm org.apache.mahout.common.distance.CosineDistanceMeasure
hdfs dfs -ls /mahout/clustering/kmeansclusters
# Dump the final clusters to the local file clusterdump.txt.
mahout clusterdump -d /mahout/clustering/kmeanssparse/dictionary.file-0 -dt sequencefile -i /mahout/clustering/kmeansclusters/clusters-1-final -n 20 -b 100 -o clusterdump.txt -p /mahout/clustering/kmeansclusters/clusteredPoints/
# Show the dump; the file name must match the -o argument of clusterdump.
cat clusterdump.txt
http://localhost:8888
http://localhost:4040
http://localhost:8787
username: root
password: root
# Run the script with the argument "pseudo" (presumably a single-node mode — confirm in zoneCluster.sh).
zoneCluster pseudo
# Start a throwaway (--rm) interactive container named Hadoop from the ecosystem
# image, publishing five ports and mounting the host notebooks directory at
# /root/notebooks. The volume argument is quoted so a home path containing
# spaces is not split into several words.
docker run --rm --name Hadoop -h hadoop \
  -p 8088:8088 -p 8042:8042 -p 50070:50070 -p 8888:8888 -p 4040:4040 \
  -v "$HOME/notebooks:/root/notebooks" \
  -ti izone/hadoop:ecosystem bash
http://localhost:8888/terminals/1
# Switch to bash in the terminal.
bash
# Download the Julia script and its data file into the current directory, then run the script.
curl -O https://raw.githubusercontent.com/luvres/hadoop/master/julia/dataset/multilinreg.jl
curl -O https://raw.githubusercontent.com/luvres/hadoop/master/julia/dataset/data.txt
julia multilinreg.jl
# Fetch the default image.
docker pull izone/hadoop
# Start a throwaway (--rm) interactive container named Hadoop, publishing three ports.
# NOTE(review): "-test bash" follows the image name, so it is passed to the
# container as its command/arguments — presumably handled by the image's entrypoint; confirm.
docker run --rm --name Hadoop -h hadoop \
-p 8088:8088 \
-p 8042:8042 \
-p 50070:50070 \
-ti izone/hadoop -test bash
# Other image tags.
docker pull izone/hadoop:cos7
docker pull izone/hadoop:cos6
docker pull izone/hadoop:alpine
# Same run command, using the alpine tag.
docker run --rm --name Hadoop -h hadoop \
-p 8088:8088 \
-p 8042:8042 \
-p 50070:50070 \
-ti izone/hadoop:alpine -test bash
# Create /bigdata in HDFS and list the HDFS root.
hdfs dfs -mkdir /bigdata
hadoop fs -ls /
# Download the sample CSV (-c resumes a partial download) and copy it into HDFS.
wget -c http://compras.dados.gov.br/contratos/v1/contratos.csv
hadoop fs -copyFromLocal contratos.csv /bigdata
hadoop fs -cat /bigdata/contratos.csv
# Run the wordcount example from the bundled MapReduce examples jar, writing to /output.
hadoop jar /opt/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.5.jar wordcount /bigdata/contratos.csv /output
# Print the job output.
hdfs dfs -cat /output/*
http://localhost:8888/
new -> python
# Notebook cell: lines starting with "!" are run as shell commands.
# Download the book text locally, then upload it to /spark/input in HDFS.
!mkdir datasets
!curl -L http://www.gutenberg.org/files/11/11-0.txt -o datasets/book.txt
!hdfs dfs -mkdir -p /spark/input
!hdfs dfs -put datasets/book.txt /spark/input
!hdfs dfs -ls /spark/input
Examples from http://spark.apache.org/examples.html
text_file = sc.textFile("hdfs://localhost:9000/spark/input/book.txt")
counts = text_file.flatMap(lambda line: line.split(" ")) \
.map(lambda word: (word, 1)) \
.reduceByKey(lambda a, b: a + b)
counts.saveAsTextFile("hdfs://localhost:9000/spark/output"
!hdfs dfs -ls /spark/output
!hdfs dfs -cat /spark/output/part-00000
# Point Spark at the Hadoop/YARN client configuration.
export HADOOP_CONF_DIR=/etc/hadoop/conf
export YARN_CONF_DIR=/etc/hadoop/conf
# Submit the SparkPi example to YARN in cluster deploy mode with the argument 10.
# "--master yarn --deploy-mode cluster" replaces the "yarn-cluster" master URL,
# which is deprecated in Spark 2.x. The jar path is quoted so that a SPARK_HOME
# containing spaces is not split into several words.
spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master yarn \
  --deploy-mode cluster \
  "$SPARK_HOME/examples/jars/spark-examples_2.11-2.0.2.jar" 10
# Start a throwaway (--rm) interactive container named Hadoop from the anaconda
# image, publishing five ports and mounting the host notebooks directory at
# /root/notebooks. The volume argument is quoted so a home path containing
# spaces is not split into several words.
docker run --rm --name Hadoop -h hadoop \
  -p 8088:8088 \
  -p 8042:8042 \
  -p 50070:50070 \
  -p 8888:8888 \
  -p 4040:4040 \
  -v "$HOME/notebooks:/root/notebooks" \
  -ti izone/hadoop:anaconda bash
# Start a throwaway (--rm) interactive container named Hadoop from the rstudio
# image, publishing six ports (including 8787) and mounting the host notebooks
# directory at /root/notebooks. The volume argument is quoted so a home path
# containing spaces is not split into several words.
docker run --rm --name Hadoop -h hadoop \
  -p 8088:8088 \
  -p 8042:8042 \
  -p 50070:50070 \
  -p 8888:8888 \
  -p 4040:4040 \
  -p 8787:8787 \
  -v "$HOME/notebooks:/root/notebooks" \
  -ti izone/hadoop:rstudio bash
# Build the default image family from source. The builds are chained with &&,
# so the sequence stops at the first build that fails.
git clone https://github.com/luvres/hadoop.git
cd hadoop
docker build -t izone/hadoop . && \
docker build -t izone/hadoop:anaconda ./anaconda/ && \
docker build -t izone/hadoop:rstudio ./rstudio/ && \
docker build -t izone/hadoop:julia ./julia/ && \
docker build -t izone/hadoop:ecosystem ./ecosystem/ && \
docker build -t izone/hadoop:cluster ./cluster/ && \
docker build -t izone/hadoop:datanode ./cluster/datanode/
# Build the cos7-tagged images from the centos7/ subdirectories. The builds are
# chained with &&, so the sequence stops at the first build that fails.
git clone https://github.com/luvres/hadoop.git
cd hadoop
docker build -t izone/hadoop:cos7 ./centos7/ && \
docker build -t izone/hadoop:cos7-miniconda ./centos7/miniconda/ && \
docker build -t izone/hadoop:cos7-ecosystem ./centos7/ecosystem/ && \
docker build -t izone/hadoop:cos7-anaconda ./centos7/anaconda/ && \
docker build -t izone/hadoop:cos7-mahout ./centos7/mahout/ && \
docker build -t izone/hadoop:cos7-cluster ./centos7/cluster/ && \
docker build -t izone/hadoop:cos7-datanode ./centos7/cluster/datanode/
# Build the cos6-tagged images from the centos6/ subdirectories. The builds are
# chained with &&, so the sequence stops at the first build that fails.
git clone https://github.com/luvres/hadoop.git
cd hadoop
docker build -t izone/hadoop:cos6 ./centos6/ && \
docker build -t izone/hadoop:cos6-miniconda ./centos6/miniconda/ && \
docker build -t izone/hadoop:cos6-ecosystem ./centos6/ecosystem/ && \
docker build -t izone/hadoop:cos6-anaconda ./centos6/anaconda/ && \
docker build -t izone/hadoop:cos6-rstudio ./centos6/rstudio/ && \
docker build -t izone/hadoop:cos6-mahout ./centos6/mahout/ && \
docker build -t izone/hadoop:cos6-cluster ./centos6/cluster/ && \
docker build -t izone/hadoop:cos6-datanode ./centos6/cluster/datanode/
# Build the alpine-tagged images from the alpine/ subdirectories; the second
# build runs only if the first succeeds.
git clone https://github.com/luvres/hadoop.git
cd hadoop
docker build -t izone/hadoop:alpine ./alpine/ && \
docker build -t izone/hadoop:alpine-datanode ./alpine/datanode/