Load necessary packages into library

library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(caTools)
library(readr)

#Import test.csv and train.csv datasets

train <- read_csv("train.csv")
## Rows: 10886 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (11): season, holiday, workingday, weather, temp, atemp, humidity, wind...
## dttm  (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(train)
## spec_tbl_df [10,886 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ datetime  : POSIXct[1:10886], format: "2011-01-01 00:00:00" "2011-01-01 01:00:00" ...
##  $ season    : num [1:10886] 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : num [1:10886] 0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: num [1:10886] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : num [1:10886] 1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num [1:10886] 9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num [1:10886] 14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : num [1:10886] 81 80 80 75 75 75 80 86 75 76 ...
##  $ windspeed : num [1:10886] 0 0 0 0 0 ...
##  $ casual    : num [1:10886] 3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: num [1:10886] 13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : num [1:10886] 16 40 32 13 1 1 2 3 8 14 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   datetime = col_datetime(format = ""),
##   ..   season = col_double(),
##   ..   holiday = col_double(),
##   ..   workingday = col_double(),
##   ..   weather = col_double(),
##   ..   temp = col_double(),
##   ..   atemp = col_double(),
##   ..   humidity = col_double(),
##   ..   windspeed = col_double(),
##   ..   casual = col_double(),
##   ..   registered = col_double(),
##   ..   count = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
test <- read_csv("test.csv")
## Rows: 6493 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (8): season, holiday, workingday, weather, temp, atemp, humidity, winds...
## dttm (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(test)
## spec_tbl_df [6,493 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ datetime  : POSIXct[1:6493], format: "2011-01-20 00:00:00" "2011-01-20 01:00:00" ...
##  $ season    : num [1:6493] 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : num [1:6493] 0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: num [1:6493] 1 1 1 1 1 1 1 1 1 1 ...
##  $ weather   : num [1:6493] 1 1 1 1 1 1 1 1 1 2 ...
##  $ temp      : num [1:6493] 10.7 10.7 10.7 10.7 10.7 ...
##  $ atemp     : num [1:6493] 11.4 13.6 13.6 12.9 12.9 ...
##  $ humidity  : num [1:6493] 56 56 56 56 56 60 60 55 55 52 ...
##  $ windspeed : num [1:6493] 26 0 0 11 11 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   datetime = col_datetime(format = ""),
##   ..   season = col_double(),
##   ..   holiday = col_double(),
##   ..   workingday = col_double(),
##   ..   weather = col_double(),
##   ..   temp = col_double(),
##   ..   atemp = col_double(),
##   ..   humidity = col_double(),
##   ..   windspeed = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Data pre-processsing

# Convert season, holiday, workingday, and weather into factor
train$season <- as.factor(train$season)
test$season <- as.factor(test$season)
train$holiday <- as.factor(train$holiday)
test$holiday <- as.factor(test$holiday)
train$workingday <- as.factor(train$workingday)
test$workingday <- as.factor(test$workingday)
train$weather <- as.factor(train$weather)
test$weather <- as.factor(test$weather)
# Add a column for day of week in train.csv & test.csv dataset
train$day <- format(as.POSIXct(train$datetime), format = '%A')
test$day <- format(as.POSIXct(test$datetime), format = '%A')
aggregate(train[,"count"],list(train$day),mean)
##     Group.1    count
## 1    Friday 197.8443
## 2    Monday 190.3907
## 3  Saturday 196.6654
## 4    Sunday 180.8398
## 5  Thursday 197.2962
## 6   Tuesday 189.7238
## 7 Wednesday 188.4113
# Create Sunday variable
train$sunday <- ifelse(train$day == 'Sunday',1,0)
train$sunday <- as.factor(train$sunday)
test$sunday <- ifelse(test$day == 'Sunday',1,0)
test$sunday <- as.factor(test$sunday)
# Add a column for time in train.csv & test.csv dataset
train$time <- format(as.POSIXct(train$datetime), format = '%H')
train$time <- as.numeric(train$time)
train$time <- as.factor(train$time)
test$time <- format(as.POSIXct(test$datetime), format = '%H')
test$time <- as.numeric(test$time)
test$time <- as.factor(test$time)
# Add a column for temperature squared in train.csv & test.csv dataset
train$tempsq <- train$temp*train$temp
test$tempsq <- test$temp*test$temp
# Holdout-validation approach for evaluating model performance (70/30 split)
# Divide the train.csv dataset into two groups: t.test and t.train
# Stratified random sampling 
library(caret) 
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(3456)
data_part <- createDataPartition(y=train$weather, times = 1, p = 0.7, list = F)
## Warning in createDataPartition(y = train$weather, times = 1, p = 0.7, list =
## F): Some classes have a single record ( 4 ) and these will be selected for the
## sample
t.train = train[data_part,] #Create the train dataset
dim(t.train)
## [1] 7622   16
t.test = train[-data_part,] #Create the test dataset
dim(t.test)
## [1] 3264   16

A. MULTIPLE LINEAR REGRESSION

# Linear regression model
factor <- (count~season+workingday+weather+temp+tempsq+humidity+time+sunday)
reg <- lm(factor,data=t.train)
summary(reg)
## 
## Call:
## lm(formula = factor, data = t.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -372.89  -62.29   -9.77   51.22  500.26 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.885e+01  1.129e+01  -3.442 0.000581 ***
## season2      3.423e+01  4.718e+00   7.255 4.43e-13 ***
## season3      1.681e+01  5.972e+00   2.815 0.004891 ** 
## season4      6.460e+01  3.960e+00  16.312  < 2e-16 ***
## workingday1 -8.090e+00  3.380e+00  -2.394 0.016696 *  
## weather2    -6.584e+00  3.125e+00  -2.107 0.035137 *  
## weather3    -6.591e+01  5.244e+00 -12.568  < 2e-16 ***
## weather4    -1.234e+02  1.108e+02  -1.114 0.265457    
## temp         7.323e+00  9.456e-01   7.745 1.08e-14 ***
## tempsq      -9.873e-03  2.309e-02  -0.428 0.668903    
## humidity    -8.468e-01  8.782e-02  -9.642  < 2e-16 ***
## time1       -1.100e+01  8.809e+00  -1.249 0.211795    
## time2       -1.776e+01  8.873e+00  -2.002 0.045304 *  
## time3       -3.654e+01  8.987e+00  -4.066 4.83e-05 ***
## time4       -3.465e+01  8.869e+00  -3.907 9.44e-05 ***
## time5       -1.873e+01  8.987e+00  -2.084 0.037149 *  
## time6        3.782e+01  8.850e+00   4.274 1.95e-05 ***
## time7        1.769e+02  8.825e+00  20.042  < 2e-16 ***
## time8        3.097e+02  8.840e+00  35.029  < 2e-16 ***
## time9        1.624e+02  8.797e+00  18.461  < 2e-16 ***
## time10       1.095e+02  8.852e+00  12.364  < 2e-16 ***
## time11       1.303e+02  8.894e+00  14.647  < 2e-16 ***
## time12       1.746e+02  8.996e+00  19.404  < 2e-16 ***
## time13       1.646e+02  9.039e+00  18.208  < 2e-16 ***
## time14       1.439e+02  9.147e+00  15.731  < 2e-16 ***
## time15       1.547e+02  9.107e+00  16.989  < 2e-16 ***
## time16       2.153e+02  9.100e+00  23.656  < 2e-16 ***
## time17       3.870e+02  9.051e+00  42.752  < 2e-16 ***
## time18       3.477e+02  8.957e+00  38.823  < 2e-16 ***
## time19       2.355e+02  8.890e+00  26.490  < 2e-16 ***
## time20       1.587e+02  8.833e+00  17.967  < 2e-16 ***
## time21       1.066e+02  8.812e+00  12.096  < 2e-16 ***
## time22       7.240e+01  8.762e+00   8.262  < 2e-16 ***
## time23       3.554e+01  8.771e+00   4.051 5.14e-05 ***
## sunday1     -2.398e+01  4.474e+00  -5.361 8.51e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 110.5 on 7587 degrees of freedom
## Multiple R-squared:  0.6323, Adjusted R-squared:  0.6307 
## F-statistic: 383.8 on 34 and 7587 DF,  p-value: < 2.2e-16
# Create the evaluation metrics function
eval_results <- function(true, predicted, df) {
                SSE <- sum((predicted - true)^2)
                SST <- sum((true - mean(true))^2)
                R_square <- 1 - SSE / SST
                RMSE = sqrt(SSE/nrow(df))
                data.frame(
                RMSE = RMSE,
                Rsquare = R_square)
}
# Predicting and evaluating the model on the train subset of the train.csv data
predictions1 = predict(reg, newdata = t.train)
eval_results(t.train$count, predictions1, t.train)
##       RMSE  Rsquare
## 1 110.2144 0.632346
# Predicting and evaluating the model on test subset of the train.csv data
predictions2 = predict(reg, newdata = t.test)
eval_results(t.test$count,predictions2, t.test)
##       RMSE   Rsquare
## 1 109.6384 0.6275456

B. Lasso

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-2
Mx<- model.matrix(count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday, data=t.train)
My<- t.train$count
Mx_test <- model.matrix(count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday, data=t.test)
My_test<-t.test$count
lambdas <- 10^seq(2, -3, by = -.1)
# Setting alpha = 1 implements lasso regression
lasso_reg <- cv.glmnet(Mx,My, alpha = 1, lambda = lambdas, standardize = TRUE, nfolds = 5)
# Best lambda
lambda_best <- lasso_reg$lambda.min 
lambda_best
## [1] 0.001
lasso_model <- glmnet(Mx,My, alpha = 1, lambda = lambda_best, standardize = TRUE)
predictions3 <- predict(lasso_model, s = lambda_best, newx = Mx)
eval_results(t.train$count,predictions3,t.train)
##       RMSE   Rsquare
## 1 110.0277 0.6335902
predictions4 <- predict(lasso_model, s = lambda_best, newx = Mx_test)
eval_results(t.test$count, predictions4, t.test)
##       RMSE   Rsquare
## 1 109.3803 0.6292975

C. CART

library(rpart)
# Fit the model on the t.train set
set.seed(123)
cart <- train(
  count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday,
  data=t.train, method = "rpart",
  trControl = trainControl("cv", number = 10),
  tuneLength = 10
  )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Print the best tuning parameter cp that
# minimize the model RMSE
cart$bestTune
##           cp
## 1 0.01095693
# Decision rules in the model
cart$finalModel
## n= 7622 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 7622 251829300.0 191.33880  
##    2) atemp< 30.6825 5597 133027100.0 150.26750  
##      4) time8< 0.5 5339 109286600.0 140.92040  
##        8) time17< 0.5 5151  90862450.0 132.38250  
##         16) time18< 0.5 4954  74771260.0 124.00790  
##           32) temp< 12.71 1348   7734817.0  70.37908 *
##           33) temp>=12.71 3606  61710260.0 144.05550  
##             66) humidity>=65.5 2069  28621250.0 111.70710 *
##             67) humidity< 65.5 1537  28009550.0 187.60050 *
##         17) time18>=0.5 197   7006558.0 342.97970 *
##        9) time17>=0.5 188   7760670.0 374.85110 *
##      5) time8>=0.5 258  13621280.0 343.69380  
##       10) workingday1< 0.5 89    832546.8 113.37080 *
##       11) workingday1>=0.5 169   5581014.0 464.98820 *
##    3) atemp>=30.6825 2025  83265560.0 304.85780  
##      6) time17< 0.5 1896  64784810.0 283.05700  
##       12) time18< 0.5 1768  49226750.0 262.24660  
##         24) humidity>=55.5 829  21365930.0 206.50660 *
##         25) humidity< 55.5 939  23011240.0 311.45690  
##           50) workingday1>=0.5 663  10961380.0 262.71950 *
##           51) workingday1< 0.5 276   6691951.0 428.53260 *
##       13) time18>=0.5 128   4216578.0 570.50000 *
##      7) time17>=0.5 129   4335218.0 625.27910 *
# Make predictions on the test data
predictions5 <- cart %>% predict(t.train)
eval_results(t.train$count,predictions5,t.train)
##       RMSE   Rsquare
## 1 132.1548 0.4713979
predictions6 <- cart %>% predict(t.test)
eval_results(t.test$count,predictions6,t.test)
##       RMSE   Rsquare
## 1 134.3513 0.4407173

D. RANDOM FOREST

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Using random forest to rank variables according to their importance
rf <- randomForest(count~season+holiday+workingday+weather+temp+tempsq+humidity+windspeed+time+sunday,data=t.train,importance = TRUE)
imp <- importance(rf, type=1)
featureImportance <- data.frame(Feature=row.names(imp), Importance=imp[,1])
# Graph variables based on its importance
ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
  geom_bar(stat="identity", fill="#53cfff") +
  coord_flip() + 
  theme_light(base_size=20) +
  xlab("Importance") +
  ylab("") + 
  ggtitle("Random Forest Feature Importance\n") +
  theme(plot.title=element_text(size=18))

# Evaluate random forest
predictions7 <- predict(rf, newdata = t.train)
eval_results(t.train$count,predictions7,t.train)
##       RMSE  Rsquare
## 1 42.00505 0.946597
predictions8 <- predict(rf, newdata = t.test)
eval_results(t.test$count, predictions8,t.test)
##       RMSE   Rsquare
## 1 72.98046 0.8349708

Use random forest for prediction

# Predict count on test.csv using random forest
finalpredictions <- round(predict(rf, newdata = test))
result <- data.frame(datetime = test$datetime, count=finalpredictions)

Append train.csv and test.csv into one dataset so we have the complete data

test$count <- finalpredictions
complete <- dplyr::bind_rows(train, test)

Visualization

Visualize bike demand according to seasons

ggplot()+ aes(x=factor(complete$season),y=complete$count)+geom_bar(stat="identity",fill="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma) + scale_x_discrete(name="Seasons",labels = c("Spring", "Summer", "Fall", "Winter")) + ggtitle("Bike Demand based on Seasons")

Visualize bike demand according to hours

ggplot()+ aes(x=factor(complete$time),y=complete$count)+geom_bar(stat="identity",color="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma)+xlab("Time")+ggtitle("Bike Demand based on Hours")

Visualize bike demand according to weather patterns

ggplot()+ aes(x=factor(complete$weather),y=complete$count)+geom_bar(stat="identity",color="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma)+scale_x_discrete(name="Weather patterns",labels = c("Clear, partly cloudy","Mist, cloudy","Light snow, light rain","Heavy rain, snow"))+ggtitle("Bike Demand based on Weather patterns")

Visualize bike demand according to temp

ggplot(data = complete, mapping = aes(x = temp, y = count, color = temp)) +
geom_point(alpha = 1) + scale_colour_gradientn(colors=rainbow(4), trans="reverse") + xlab("Temperature") + ylab ("Count of Bikes") + ggtitle("Bike Demand based on Temp")

Visualize bike demand according to types of users

ggplot(train, aes(x=time, y=count,)) + 
  geom_boxplot() + xlab("hour") + ylab("count") + ggtitle("Hourly Bike Demand for Total Users")

ggplot(train, aes(x=time, y=casual)) + 
  geom_boxplot() + xlab("hour") + ylab("casual users")+ ggtitle("Hourly Bike Demand for Casual Users")

ggplot(train, aes(x=time, y=registered)) + 
  geom_boxplot() + xlab("hour") + ylab("registered users")+ggtitle("Hourly Bike Demand for Registered Users")