library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(caTools)
library(readr)
#Import test.csv and train.csv datasets
train <- read_csv("train.csv")
## Rows: 10886 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): season, holiday, workingday, weather, temp, atemp, humidity, wind...
## dttm (1): datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(train)
## spec_tbl_df [10,886 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ datetime : POSIXct[1:10886], format: "2011-01-01 00:00:00" "2011-01-01 01:00:00" ...
## $ season : num [1:10886] 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : num [1:10886] 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: num [1:10886] 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : num [1:10886] 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num [1:10886] 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num [1:10886] 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : num [1:10886] 81 80 80 75 75 75 80 86 75 76 ...
## $ windspeed : num [1:10886] 0 0 0 0 0 ...
## $ casual : num [1:10886] 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: num [1:10886] 13 32 27 10 1 1 0 2 7 6 ...
## $ count : num [1:10886] 16 40 32 13 1 1 2 3 8 14 ...
## - attr(*, "spec")=
## .. cols(
## .. datetime = col_datetime(format = ""),
## .. season = col_double(),
## .. holiday = col_double(),
## .. workingday = col_double(),
## .. weather = col_double(),
## .. temp = col_double(),
## .. atemp = col_double(),
## .. humidity = col_double(),
## .. windspeed = col_double(),
## .. casual = col_double(),
## .. registered = col_double(),
## .. count = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
test <- read_csv("test.csv")
## Rows: 6493 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (8): season, holiday, workingday, weather, temp, atemp, humidity, winds...
## dttm (1): datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(test)
## spec_tbl_df [6,493 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ datetime : POSIXct[1:6493], format: "2011-01-20 00:00:00" "2011-01-20 01:00:00" ...
## $ season : num [1:6493] 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : num [1:6493] 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: num [1:6493] 1 1 1 1 1 1 1 1 1 1 ...
## $ weather : num [1:6493] 1 1 1 1 1 1 1 1 1 2 ...
## $ temp : num [1:6493] 10.7 10.7 10.7 10.7 10.7 ...
## $ atemp : num [1:6493] 11.4 13.6 13.6 12.9 12.9 ...
## $ humidity : num [1:6493] 56 56 56 56 56 60 60 55 55 52 ...
## $ windspeed : num [1:6493] 26 0 0 11 11 ...
## - attr(*, "spec")=
## .. cols(
## .. datetime = col_datetime(format = ""),
## .. season = col_double(),
## .. holiday = col_double(),
## .. workingday = col_double(),
## .. weather = col_double(),
## .. temp = col_double(),
## .. atemp = col_double(),
## .. humidity = col_double(),
## .. windspeed = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Convert season, holiday, workingday, and weather into factor
train$season <- as.factor(train$season)
test$season <- as.factor(test$season)
train$holiday <- as.factor(train$holiday)
test$holiday <- as.factor(test$holiday)
train$workingday <- as.factor(train$workingday)
test$workingday <- as.factor(test$workingday)
train$weather <- as.factor(train$weather)
test$weather <- as.factor(test$weather)
# Add a column for day of week in train.csv & test.csv dataset
train$day <- format(as.POSIXct(train$datetime), format = '%A')
test$day <- format(as.POSIXct(test$datetime), format = '%A')
aggregate(train[,"count"],list(train$day),mean)
## Group.1 count
## 1 Friday 197.8443
## 2 Monday 190.3907
## 3 Saturday 196.6654
## 4 Sunday 180.8398
## 5 Thursday 197.2962
## 6 Tuesday 189.7238
## 7 Wednesday 188.4113
# Create Sunday variable
train$sunday <- ifelse(train$day == 'Sunday',1,0)
train$sunday <- as.factor(train$sunday)
test$sunday <- ifelse(test$day == 'Sunday',1,0)
test$sunday <- as.factor(test$sunday)
# Add a column for time in train.csv & test.csv dataset
train$time <- format(as.POSIXct(train$datetime), format = '%H')
train$time <- as.numeric(train$time)
train$time <- as.factor(train$time)
test$time <- format(as.POSIXct(test$datetime), format = '%H')
test$time <- as.numeric(test$time)
test$time <- as.factor(test$time)
# Add a column for temperature squared in train.csv & test.csv dataset
train$tempsq <- train$temp*train$temp
test$tempsq <- test$temp*test$temp
# Holdout-validation approach for evaluating model performance (70/30 split)
# Divide the train.csv dataset into two groups: t.test and t.train
# Stratified random sampling
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(3456)
data_part <- createDataPartition(y=train$weather, times = 1, p = 0.7, list = F)
## Warning in createDataPartition(y = train$weather, times = 1, p = 0.7, list =
## F): Some classes have a single record ( 4 ) and these will be selected for the
## sample
t.train = train[data_part,] #Create the train dataset
dim(t.train)
## [1] 7622 16
t.test = train[-data_part,] #Create the test dataset
dim(t.test)
## [1] 3264 16
# Linear regression model
factor <- (count~season+workingday+weather+temp+tempsq+humidity+time+sunday)
reg <- lm(factor,data=t.train)
summary(reg)
##
## Call:
## lm(formula = factor, data = t.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -372.89 -62.29 -9.77 51.22 500.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.885e+01 1.129e+01 -3.442 0.000581 ***
## season2 3.423e+01 4.718e+00 7.255 4.43e-13 ***
## season3 1.681e+01 5.972e+00 2.815 0.004891 **
## season4 6.460e+01 3.960e+00 16.312 < 2e-16 ***
## workingday1 -8.090e+00 3.380e+00 -2.394 0.016696 *
## weather2 -6.584e+00 3.125e+00 -2.107 0.035137 *
## weather3 -6.591e+01 5.244e+00 -12.568 < 2e-16 ***
## weather4 -1.234e+02 1.108e+02 -1.114 0.265457
## temp 7.323e+00 9.456e-01 7.745 1.08e-14 ***
## tempsq -9.873e-03 2.309e-02 -0.428 0.668903
## humidity -8.468e-01 8.782e-02 -9.642 < 2e-16 ***
## time1 -1.100e+01 8.809e+00 -1.249 0.211795
## time2 -1.776e+01 8.873e+00 -2.002 0.045304 *
## time3 -3.654e+01 8.987e+00 -4.066 4.83e-05 ***
## time4 -3.465e+01 8.869e+00 -3.907 9.44e-05 ***
## time5 -1.873e+01 8.987e+00 -2.084 0.037149 *
## time6 3.782e+01 8.850e+00 4.274 1.95e-05 ***
## time7 1.769e+02 8.825e+00 20.042 < 2e-16 ***
## time8 3.097e+02 8.840e+00 35.029 < 2e-16 ***
## time9 1.624e+02 8.797e+00 18.461 < 2e-16 ***
## time10 1.095e+02 8.852e+00 12.364 < 2e-16 ***
## time11 1.303e+02 8.894e+00 14.647 < 2e-16 ***
## time12 1.746e+02 8.996e+00 19.404 < 2e-16 ***
## time13 1.646e+02 9.039e+00 18.208 < 2e-16 ***
## time14 1.439e+02 9.147e+00 15.731 < 2e-16 ***
## time15 1.547e+02 9.107e+00 16.989 < 2e-16 ***
## time16 2.153e+02 9.100e+00 23.656 < 2e-16 ***
## time17 3.870e+02 9.051e+00 42.752 < 2e-16 ***
## time18 3.477e+02 8.957e+00 38.823 < 2e-16 ***
## time19 2.355e+02 8.890e+00 26.490 < 2e-16 ***
## time20 1.587e+02 8.833e+00 17.967 < 2e-16 ***
## time21 1.066e+02 8.812e+00 12.096 < 2e-16 ***
## time22 7.240e+01 8.762e+00 8.262 < 2e-16 ***
## time23 3.554e+01 8.771e+00 4.051 5.14e-05 ***
## sunday1 -2.398e+01 4.474e+00 -5.361 8.51e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 110.5 on 7587 degrees of freedom
## Multiple R-squared: 0.6323, Adjusted R-squared: 0.6307
## F-statistic: 383.8 on 34 and 7587 DF, p-value: < 2.2e-16
# Create the evaluation metrics function
eval_results <- function(true, predicted, df) {
SSE <- sum((predicted - true)^2)
SST <- sum((true - mean(true))^2)
R_square <- 1 - SSE / SST
RMSE = sqrt(SSE/nrow(df))
data.frame(
RMSE = RMSE,
Rsquare = R_square)
}
# Predicting and evaluating the model on the train subset of the train.csv data
predictions1 = predict(reg, newdata = t.train)
eval_results(t.train$count, predictions1, t.train)
## RMSE Rsquare
## 1 110.2144 0.632346
# Predicting and evaluating the model on test subset of the train.csv data
predictions2 = predict(reg, newdata = t.test)
eval_results(t.test$count,predictions2, t.test)
## RMSE Rsquare
## 1 109.6384 0.6275456
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-2
Mx<- model.matrix(count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday, data=t.train)
My<- t.train$count
Mx_test <- model.matrix(count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday, data=t.test)
My_test<-t.test$count
lambdas <- 10^seq(2, -3, by = -.1)
# Setting alpha = 1 implements lasso regression
lasso_reg <- cv.glmnet(Mx,My, alpha = 1, lambda = lambdas, standardize = TRUE, nfolds = 5)
# Best lambda
lambda_best <- lasso_reg$lambda.min
lambda_best
## [1] 0.001
lasso_model <- glmnet(Mx,My, alpha = 1, lambda = lambda_best, standardize = TRUE)
predictions3 <- predict(lasso_model, s = lambda_best, newx = Mx)
eval_results(t.train$count,predictions3,t.train)
## RMSE Rsquare
## 1 110.0277 0.6335902
predictions4 <- predict(lasso_model, s = lambda_best, newx = Mx_test)
eval_results(t.test$count, predictions4, t.test)
## RMSE Rsquare
## 1 109.3803 0.6292975
library(rpart)
# Fit the model on the t.train set
set.seed(123)
cart <- train(
count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday,
data=t.train, method = "rpart",
trControl = trainControl("cv", number = 10),
tuneLength = 10
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Print the best tuning parameter cp that
# minimize the model RMSE
cart$bestTune
## cp
## 1 0.01095693
# Decision rules in the model
cart$finalModel
## n= 7622
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 7622 251829300.0 191.33880
## 2) atemp< 30.6825 5597 133027100.0 150.26750
## 4) time8< 0.5 5339 109286600.0 140.92040
## 8) time17< 0.5 5151 90862450.0 132.38250
## 16) time18< 0.5 4954 74771260.0 124.00790
## 32) temp< 12.71 1348 7734817.0 70.37908 *
## 33) temp>=12.71 3606 61710260.0 144.05550
## 66) humidity>=65.5 2069 28621250.0 111.70710 *
## 67) humidity< 65.5 1537 28009550.0 187.60050 *
## 17) time18>=0.5 197 7006558.0 342.97970 *
## 9) time17>=0.5 188 7760670.0 374.85110 *
## 5) time8>=0.5 258 13621280.0 343.69380
## 10) workingday1< 0.5 89 832546.8 113.37080 *
## 11) workingday1>=0.5 169 5581014.0 464.98820 *
## 3) atemp>=30.6825 2025 83265560.0 304.85780
## 6) time17< 0.5 1896 64784810.0 283.05700
## 12) time18< 0.5 1768 49226750.0 262.24660
## 24) humidity>=55.5 829 21365930.0 206.50660 *
## 25) humidity< 55.5 939 23011240.0 311.45690
## 50) workingday1>=0.5 663 10961380.0 262.71950 *
## 51) workingday1< 0.5 276 6691951.0 428.53260 *
## 13) time18>=0.5 128 4216578.0 570.50000 *
## 7) time17>=0.5 129 4335218.0 625.27910 *
# Make predictions on the test data
predictions5 <- cart %>% predict(t.train)
eval_results(t.train$count,predictions5,t.train)
## RMSE Rsquare
## 1 132.1548 0.4713979
predictions6 <- cart %>% predict(t.test)
eval_results(t.test$count,predictions6,t.test)
## RMSE Rsquare
## 1 134.3513 0.4407173
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Using random forest to rank variables according to their importance
rf <- randomForest(count~season+holiday+workingday+weather+temp+tempsq+humidity+windspeed+time+sunday,data=t.train,importance = TRUE)
imp <- importance(rf, type=1)
featureImportance <- data.frame(Feature=row.names(imp), Importance=imp[,1])
# Graph variables based on its importance
ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
geom_bar(stat="identity", fill="#53cfff") +
coord_flip() +
theme_light(base_size=20) +
xlab("Importance") +
ylab("") +
ggtitle("Random Forest Feature Importance\n") +
theme(plot.title=element_text(size=18))
# Evaluate random forest
predictions7 <- predict(rf, newdata = t.train)
eval_results(t.train$count,predictions7,t.train)
## RMSE Rsquare
## 1 42.00505 0.946597
predictions8 <- predict(rf, newdata = t.test)
eval_results(t.test$count, predictions8,t.test)
## RMSE Rsquare
## 1 72.98046 0.8349708
# Predict count on test.csv using random forest
finalpredictions <- round(predict(rf, newdata = test))
result <- data.frame(datetime = test$datetime, count=finalpredictions)
test$count <- finalpredictions
complete <- dplyr::bind_rows(train, test)
ggplot()+ aes(x=factor(complete$season),y=complete$count)+geom_bar(stat="identity",fill="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma) + scale_x_discrete(name="Seasons",labels = c("Spring", "Summer", "Fall", "Winter")) + ggtitle("Bike Demand based on Seasons")
ggplot()+ aes(x=factor(complete$time),y=complete$count)+geom_bar(stat="identity",color="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma)+xlab("Time")+ggtitle("Bike Demand based on Hours")
ggplot()+ aes(x=factor(complete$weather),y=complete$count)+geom_bar(stat="identity",color="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma)+scale_x_discrete(name="Weather patterns",labels = c("Clear, partly cloudy","Mist, cloudy","Light snow, light rain","Heavy rain, snow"))+ggtitle("Bike Demand based on Weather patterns")
ggplot(data = complete, mapping = aes(x = temp, y = count, color = temp)) +
geom_point(alpha = 1) + scale_colour_gradientn(colors=rainbow(4), trans="reverse") + xlab("Temperature") + ylab ("Count of Bikes") + ggtitle("Bike Demand based on Temp")
ggplot(train, aes(x=time, y=count,)) +
geom_boxplot() + xlab("hour") + ylab("count") + ggtitle("Hourly Bike Demand for Total Users")
ggplot(train, aes(x=time, y=casual)) +
geom_boxplot() + xlab("hour") + ylab("casual users")+ ggtitle("Hourly Bike Demand for Casual Users")
ggplot(train, aes(x=time, y=registered)) +
geom_boxplot() + xlab("hour") + ylab("registered users")+ggtitle("Hourly Bike Demand for Registered Users")