Objectives

What you will get

Lesson 1: Essential R programming

# show the current working directory, then list the files it contains
getwd()
list.files()

# arithmetic operators: addition, multiplication, exponentiation
1 + 1
2 * 2
3^2

# store values under names and compute with them
income <- 10000
food_expense <- 2000
left_over <- income - food_expense

# read a CSV file from the working directory into a data frame
df <- read.csv("test.csv")

# inspect the result: structure first, then the first and last rows
str(df)
head(df)
tail(df)

# write the data frame back to disk; row.names = FALSE stops write.csv() from
# adding an unnamed column of row indices, so the file reads back with the
# same columns it was written with
write.csv(df, "data.csv", row.names = FALSE)

# install dplyr only when it is not already available, so re-running the
# script does not download and reinstall the package every time
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

Lesson 2: Clean data

# check missing values: TRUE marks a row with no NA in any column
complete.cases(df)
# proportion of rows that are complete
mean(complete.cases(df))

# remove rows with NA; drop = FALSE keeps the result a data frame even when
# df has a single column (the default would collapse it to a plain vector)
clean_df <- df[complete.cases(df), , drop = FALSE]

# mean imputation: fill the missing entry with the mean of the observed values
x <- c(seq_len(10), NA)
is.na(x)
avg_x <- mean(x, na.rm = TRUE)
x <- replace(x, is.na(x), avg_x)

# case study: breast-cancer-wisconsin data file from the UCI repository
# the address must be a bare URL; a markdown-style <...> wrapper inside the
# string makes the download fail
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
# read without a header row, comma-separated, treating "?" as NA
df <- read.table(
  url,
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE,
  na.strings = "?"
)
glimpse(df)
mean(complete.cases(df))

# remove rows with NA (drop = FALSE guarantees a data frame is returned)
clean_df <- df[complete.cases(df), , drop = FALSE]
glimpse(clean_df)
mean(complete.cases(clean_df))

Lesson 3: Data Manipulation