R테스트용 코드
# Smoke test for the RHadoop stack: point R at the Hadoop installation under
# /usr/local/hadoop, then push a small vector through to.dfs()/from.dfs() and
# a map-only rmr2 job.

# Hadoop launcher, streaming jar, Hadoop home and Java home used by this session.
Sys.setenv(HADOOP_CMD = "/usr/local/hadoop/bin/hadoop")
Sys.setenv(
  HADOOP_STREAMING =
    "/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.9.2.jar"
)
Sys.setenv(HADOOP_HOME = "/usr/local/hadoop")
Sys.setenv(JAVA_HOME = "/usr/lib/jvm/java-1.8.0-openjdk-arm64")
# This line used to be the shell statement `export HADOOP_USER_NAME=ubuntu1`,
# which is a syntax error inside R. Sys.setenv() sets the same variable for
# this R process and the processes it launches.
Sys.setenv(HADOOP_USER_NAME = "ubuntu1")

library(rhdfs)
hdfs.init()
library(rmr2)

# Store 1..10 with to.dfs() and read it straight back as a sanity check.
small.ints <- to.dfs(1:10)
from.dfs(small.ints)

# Map-only job: each value is emitted next to its square.
result <- mapreduce(input = small.ints, map = function(k, v) cbind(v, v^2))
out <- from.dfs(result)
out
# Install the packages that rmr2 depends on
install.packages(
  c(
    "Rcpp", "RJSONIO", "digest", "functional",
    "reshape2", "stringr", "plyr", "caTools"
  )
)
# Install rmr2 from the tarball in the working directory
install.packages("rmr2_3.3.1.tar.gz")
# Install the packages that plyrmr depends on
install.packages(
  c("dplyr", "R.methodsS3", "Hmisc", "memoise", "lazyeval", "rjson")
)
# Install plyrmr from the tarball in the working directory
install.packages("plyrmr_0.6.0.tar.gz")
# Install the package that rhdfs depends on
install.packages("rJava")
# Set the Hadoop environment variables before rhdfs is installed
Sys.setenv(HADOOP_CMD = "/usr/local/hadoop/bin/hadoop")
Sys.setenv(
  HADOOP_STREAMING =
    "/usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.9.2.jar"
)
# Load rJava, then install rhdfs from the tarball in the working directory
library(rJava)
install.packages("rhdfs_1.0.8.tar.gz")
프록시 서버 켜기
# Start the YARN proxy server daemon.
yarn-daemon.sh start proxyserver
2,3,4,5에도
R패키지 codetools 설치 필요
하둡 동작 확인
https://tecadmin.net/hadoop-running-a-wordcount-mapreduce-example/
# Run the WordCount job from the bundled MapReduce examples jar as a cluster check.
# The jar path is relative, so run this from the Hadoop installation directory.
# `input` and `output` are the two arguments handed to wordcount
# (presumably the HDFS input and output directories; confirm before running).
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.2.jar wordcount input output
ssh작업
# Set up key-based SSH from this machine to every cluster node (ubuntu1..ubuntu5)
# for the ubuntu1 account.
#
# NOTE(review): the account password is kept here in plain text; rotate it or
# move it out of this file before sharing these notes.
# sshpass -e reads the password from SSHPASS, which keeps it off the command
# line (and out of `ps` output), unlike `sshpass -p`.
export SSHPASS='R2020hjp'

# known_hosts is addressed through ~ so the commands work from any directory.
mkdir -p ~/.ssh

for host in ubuntu1 ubuntu2 ubuntu3 ubuntu4 ubuntu5; do
  # Record the node's host key so the first login does not prompt.
  ssh-keyscan -H "$host" >> ~/.ssh/known_hosts
  # Log in once with the password and exit immediately.
  # NOTE(review): with OpenSSH, -y only redirects log output to syslog; it does
  # not auto-accept host keys. Confirm whether the flag is still wanted.
  sshpass -e ssh "ubuntu1@$host" -y 'exit'
  # Install the local public key on the node.
  sshpass -e ssh-copy-id "ubuntu1@$host"
done

# Do not leave the password in the shell environment afterwards.
unset SSHPASS
# Download the RHadoop source tarballs into the current directory.
# The URLs are quoted: the Markdown-style <...> wrappers would be parsed by the
# shell as redirections, and the `?` in the first URL is a glob character.
# The output names match the files that the install.packages() calls in these
# notes expect (rhdfs_1.0.8.tar.gz, plyrmr_0.6.0.tar.gz, rmr2_3.3.1.tar.gz).
sudo wget -O rhdfs_1.0.8.tar.gz 'https://github.com/RevolutionAnalytics/rhdfs/blob/master/build/rhdfs_1.0.8.tar.gz?raw=true'
sudo wget -O plyrmr_0.6.0.tar.gz 'https://github.com/RevolutionAnalytics/plyrmr/releases/download/0.6.0/plyrmr_0.6.0.tar.gz'
sudo wget -O rmr2_3.3.1.tar.gz 'https://github.com/RevolutionAnalytics/rmr2/releases/download/3.3.1/rmr2_3.3.1.tar.gz'
# Dependencies of rmr2
install.packages(c(
  "Rcpp",
  "RJSONIO",
  "digest",
  "functional",
  "reshape2",
  "stringr",
  "plyr",
  "caTools"
))
# rmr2 itself (local tarball)
install.packages("rmr2_3.3.1.tar.gz")
# Dependencies of plyrmr
install.packages(c(
  "dplyr",
  "R.methodsS3",
  "Hmisc",
  "memoise",
  "lazyeval",
  "rjson"
))
# plyrmr itself (local tarball)
install.packages("plyrmr_0.6.0.tar.gz")
# Dependency of rhdfs
install.packages("rJava")
# Environment variables for the Hadoop installation under /home/ubuntu1/hadoop.
# NOTE(review): this block uses the 2.8.0 streaming jar, while other parts of
# these notes use Hadoop 2.9.2 under /usr/local/hadoop. Confirm which
# installation applies.
Sys.setenv(HADOOP_CMD = "/home/ubuntu1/hadoop/bin/hadoop")
Sys.setenv(
  HADOOP_STREAMING =
    "/home/ubuntu1/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.8.0.jar"
)
# rhdfs itself (local tarball), after loading rJava
library(rJava)
install.packages("rhdfs_1.0.8.tar.gz")
# Smoke test against the Hadoop installation under /home/ubuntu1/hadoop:
# set the environment, initialise rhdfs, then run a map-only rmr2 job.
# NOTE(review): this block uses the 2.8.0 streaming jar, while other parts of
# these notes use Hadoop 2.9.2 under /usr/local/hadoop. Confirm which
# installation applies.
Sys.setenv(
  HADOOP_CMD = "/home/ubuntu1/hadoop/bin/hadoop",
  HADOOP_STREAMING =
    "/home/ubuntu1/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.8.0.jar",
  HADOOP_HOME = "/home/ubuntu1/hadoop",
  JAVA_HOME = "/usr/lib/jvm/java-1.8.0-openjdk-arm64"
)

library(rhdfs)
hdfs.init()
library(rmr2)

# Round trip: store 1..10 with to.dfs() and read it back.
small.ints <- to.dfs(1:10)
from.dfs(small.ints)

# Map-only job that pairs each value with its square.
result <- mapreduce(
  input = small.ints,
  map = function(k, v) cbind(v, v^2)
)
out <- from.dfs(result)
out
모든 노드