R테스트용 코드

# Point R at the local Hadoop installation. These variables are set before
# rhdfs / rmr2 are loaded below.
Sys.setenv(HADOOP_CMD = "/usr/local/hadoop/bin/hadoop")
Sys.setenv(HADOOP_STREAMING = "/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.9.2.jar")
Sys.setenv(HADOOP_HOME = "/usr/local/hadoop")
Sys.setenv(JAVA_HOME = "/usr/lib/jvm/java-1.8.0-openjdk-arm64")
# Run Hadoop commands as user "ubuntu1". This was written as a shell
# `export ...` line, which is not valid R; Sys.setenv() is the R equivalent.
Sys.setenv(HADOOP_USER_NAME = "ubuntu1")

# Smoke test for the RHadoop stack: round-trip a small vector through the
# distributed file system and run a map-only job on it.
# NOTE(review): rhdfs / rmr2 presumably read the HADOOP_* environment
# variables, so those must be set before this point -- confirm.
library(rhdfs)
hdfs.init()
library(rmr2)

# Write the integers 1..10 to the DFS and read them back (result is printed).
small.ints <- to.dfs(1:10)
from.dfs(small.ints)
# Map-only job: the map function emits each value next to its square.
result <- mapreduce(input = small.ints, map=function(k,v) cbind(v,v^2))
# Fetch the job output back into the R session and print it.
out <- from.dfs(result)
out

# Install the CRAN packages that rmr2 depends on
install.packages(c("Rcpp", "RJSONIO", "digest", "functional", "reshape2", "stringr", "plyr", "caTools"))

# Install rmr2 from the local source tarball (expected in the working
# directory). repos = NULL / type = "source" state explicitly that this is a
# file path, not a package name to look up on CRAN.
install.packages("rmr2_3.3.1.tar.gz", repos = NULL, type = "source")

# Install the CRAN packages that plyrmr depends on
install.packages(c("dplyr", "R.methodsS3", "Hmisc", "memoise", "lazyeval", "rjson"))

# Install plyrmr from the local source tarball
install.packages("plyrmr_0.6.0.tar.gz", repos = NULL, type = "source")

# Install the CRAN package that rhdfs depends on
install.packages("rJava")

# Set the Hadoop environment variables before installing rhdfs
Sys.setenv(HADOOP_CMD = "/usr/local/hadoop/bin/hadoop")
Sys.setenv(HADOOP_STREAMING = "/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.9.2.jar")

# Install rhdfs from the local source tarball (rJava is loaded first)
library(rJava)
install.packages("rhdfs_1.0.8.tar.gz", repos = NULL, type = "source")

프록시 서버 켜기

yarn-daemon.sh start proxyserver

1,2,3,4,5에도

R패키지 codetools 설치 필요

하둡 동작 확인

https://tecadmin.net/hadoop-running-a-wordcount-mapreduce-example/

# Run the WordCount example job with "input" and "output" as its arguments.
# The jar path is relative, so run this from the directory that contains
# share/hadoop/ (presumably the Hadoop install directory -- confirm).
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.2.jar wordcount input output

ssh작업

# Set up password-less SSH from this machine to every cluster node
# (ubuntu1 .. ubuntu5), logging in as user ubuntu1.
#
# The password is prompted for once and handed to sshpass through the SSHPASS
# environment variable (-e) instead of being written in this file and passed
# with -p, where it would be visible in the process list and shell history.
read -rsp "Password for ubuntu1: " SSHPASS && echo
export SSHPASS

for host in ubuntu1 ubuntu2 ubuntu3 ubuntu4 ubuntu5; do
  # Record the node's host key so the ssh calls below do not stop at the
  # "unknown host" prompt. The path is anchored to $HOME so the command works
  # from any working directory.
  ssh-keyscan -H "$host" >> "$HOME/.ssh/known_hosts"
  # Check that password login works before copying the key.
  # NOTE(review): in OpenSSH, -y sends log output to syslog; it does not
  # auto-accept host keys (that is the Dropbear meaning) -- confirm intent.
  sshpass -e ssh "ubuntu1@$host" -y 'exit'
  # Install the local public key on the node.
  sshpass -e ssh-copy-id "ubuntu1@$host"
done

# Do not leave the password in the shell environment.
unset SSHPASS
# Download the RHadoop source tarballs into the current directory.
# The URLs are quoted (the "?" would otherwise be a glob character) and the
# markdown <...> wrappers are removed, because the shell parses < and > as
# redirections. The output names match the versioned file names used by the
# install.packages() calls in these notes.
sudo wget -O rhdfs_1.0.8.tar.gz "https://github.com/RevolutionAnalytics/rhdfs/blob/master/build/rhdfs_1.0.8.tar.gz?raw=true"
sudo wget -O plyrmr_0.6.0.tar.gz "https://github.com/RevolutionAnalytics/plyrmr/releases/download/0.6.0/plyrmr_0.6.0.tar.gz"
sudo wget -O rmr2_3.3.1.tar.gz "https://github.com/RevolutionAnalytics/rmr2/releases/download/3.3.1/rmr2_3.3.1.tar.gz"

# Install the CRAN packages that rmr2 depends on
install.packages(c("Rcpp", "RJSONIO", "digest", "functional", "reshape2", "stringr", "plyr", "caTools"))

# Install rmr2 from the local source tarball (expected in the working
# directory). repos = NULL / type = "source" state explicitly that this is a
# file path, not a package name to look up on CRAN.
install.packages("rmr2_3.3.1.tar.gz", repos = NULL, type = "source")

# Install the CRAN packages that plyrmr depends on
install.packages(c("dplyr", "R.methodsS3", "Hmisc", "memoise", "lazyeval", "rjson"))

# Install plyrmr from the local source tarball
install.packages("plyrmr_0.6.0.tar.gz", repos = NULL, type = "source")

# Install the CRAN package that rhdfs depends on
install.packages("rJava")

# Set the Hadoop environment variables before installing rhdfs.
# NOTE(review): this variant points at a Hadoop under /home/ubuntu1 and a
# 2.8.0 streaming jar, while other commands in these notes use 2.9.2 --
# confirm the jar version matches the installed Hadoop.
Sys.setenv(HADOOP_CMD = "/home/ubuntu1/hadoop/bin/hadoop")
Sys.setenv(HADOOP_STREAMING = "/home/ubuntu1/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.8.0.jar")

# Install rhdfs from the local source tarball (rJava is loaded first)
library(rJava)
install.packages("rhdfs_1.0.8.tar.gz", repos = NULL, type = "source")

# Configure the Hadoop / Java locations for this R session in one call.
Sys.setenv(
  HADOOP_CMD = "/home/ubuntu1/hadoop/bin/hadoop",
  HADOOP_STREAMING = "/home/ubuntu1/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.8.0.jar",
  HADOOP_HOME = "/home/ubuntu1/hadoop",
  JAVA_HOME = "/usr/lib/jvm/java-1.8.0-openjdk-arm64"
)

# Smoke test for the RHadoop stack: round-trip a small vector through the
# distributed file system and run a map-only job on it.
# NOTE(review): rhdfs / rmr2 presumably read the HADOOP_* environment
# variables, so those must be set before this point -- confirm.
library(rhdfs)
hdfs.init()
library(rmr2)

# Write the integers 1..10 to the DFS and read them back (result is printed).
small.ints <- to.dfs(1:10)
from.dfs(small.ints)
# Map-only job: the map function emits each value next to its square.
result <- mapreduce(input = small.ints, map=function(k,v) cbind(v,v^2))
# Fetch the job output back into the R session and print it.
out <- from.dfs(result)
out

모든 노드