Load necessary packages into library

library(data.table)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()

library(caTools)
library(readr)

#Import test.csv and train.csv datasets

train <- read_csv("train.csv")

## Rows: 10886 Columns: 12

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (11): season, holiday, workingday, weather, temp, atemp, humidity, wind...
## dttm  (1): datetime

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(train)

## spec_tbl_df [10,886 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ datetime  : POSIXct[1:10886], format: "2011-01-01 00:00:00" "2011-01-01 01:00:00" ...
##  $ season    : num [1:10886] 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : num [1:10886] 0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: num [1:10886] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : num [1:10886] 1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num [1:10886] 9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num [1:10886] 14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : num [1:10886] 81 80 80 75 75 75 80 86 75 76 ...
##  $ windspeed : num [1:10886] 0 0 0 0 0 ...
##  $ casual    : num [1:10886] 3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: num [1:10886] 13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : num [1:10886] 16 40 32 13 1 1 2 3 8 14 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   datetime = col_datetime(format = ""),
##   ..   season = col_double(),
##   ..   holiday = col_double(),
##   ..   workingday = col_double(),
##   ..   weather = col_double(),
##   ..   temp = col_double(),
##   ..   atemp = col_double(),
##   ..   humidity = col_double(),
##   ..   windspeed = col_double(),
##   ..   casual = col_double(),
##   ..   registered = col_double(),
##   ..   count = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

test <- read_csv("test.csv")

## Rows: 6493 Columns: 9

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (8): season, holiday, workingday, weather, temp, atemp, humidity, winds...
## dttm (1): datetime

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(test)

## spec_tbl_df [6,493 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ datetime  : POSIXct[1:6493], format: "2011-01-20 00:00:00" "2011-01-20 01:00:00" ...
##  $ season    : num [1:6493] 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : num [1:6493] 0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: num [1:6493] 1 1 1 1 1 1 1 1 1 1 ...
##  $ weather   : num [1:6493] 1 1 1 1 1 1 1 1 1 2 ...
##  $ temp      : num [1:6493] 10.7 10.7 10.7 10.7 10.7 ...
##  $ atemp     : num [1:6493] 11.4 13.6 13.6 12.9 12.9 ...
##  $ humidity  : num [1:6493] 56 56 56 56 56 60 60 55 55 52 ...
##  $ windspeed : num [1:6493] 26 0 0 11 11 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   datetime = col_datetime(format = ""),
##   ..   season = col_double(),
##   ..   holiday = col_double(),
##   ..   workingday = col_double(),
##   ..   weather = col_double(),
##   ..   temp = col_double(),
##   ..   atemp = col_double(),
##   ..   humidity = col_double(),
##   ..   windspeed = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Data pre-processsing

# Convert season, holiday, workingday, and weather into factor
train$season <- as.factor(train$season)
test$season <- as.factor(test$season)
train$holiday <- as.factor(train$holiday)
test$holiday <- as.factor(test$holiday)
train$workingday <- as.factor(train$workingday)
test$workingday <- as.factor(test$workingday)
train$weather <- as.factor(train$weather)
test$weather <- as.factor(test$weather)
# Add a column for day of week in train.csv & test.csv dataset
train$day <- format(as.POSIXct(train$datetime), format = '%A')
test$day <- format(as.POSIXct(test$datetime), format = '%A')
aggregate(train[,"count"],list(train$day),mean)

##     Group.1    count
## 1    Friday 197.8443
## 2    Monday 190.3907
## 3  Saturday 196.6654
## 4    Sunday 180.8398
## 5  Thursday 197.2962
## 6   Tuesday 189.7238
## 7 Wednesday 188.4113

# Create Sunday variable
train$sunday <- ifelse(train$day == 'Sunday',1,0)
train$sunday <- as.factor(train$sunday)
test$sunday <- ifelse(test$day == 'Sunday',1,0)
test$sunday <- as.factor(test$sunday)
# Add a column for time in train.csv & test.csv dataset
train$time <- format(as.POSIXct(train$datetime), format = '%H')
train$time <- as.numeric(train$time)
train$time <- as.factor(train$time)
test$time <- format(as.POSIXct(test$datetime), format = '%H')
test$time <- as.numeric(test$time)
test$time <- as.factor(test$time)
# Add a column for temperature squared in train.csv & test.csv dataset
train$tempsq <- train$temp*train$temp
test$tempsq <- test$temp*test$temp
# Holdout-validation approach for evaluating model performance (70/30 split)
# Divide the train.csv dataset into two groups: t.test and t.train
# Stratified random sampling 
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(3456)
data_part <- createDataPartition(y=train$weather, times = 1, p = 0.7, list = F)

## Warning in createDataPartition(y = train$weather, times = 1, p = 0.7, list =
## F): Some classes have a single record ( 4 ) and these will be selected for the
## sample

t.train = train[data_part,] #Create the train dataset
dim(t.train)

## [1] 7622   16

t.test = train[-data_part,] #Create the test dataset
dim(t.test)

## [1] 3264   16

A. MULTIPLE LINEAR REGRESSION

# Linear regression model
factor <- (count~season+workingday+weather+temp+tempsq+humidity+time+sunday)
reg <- lm(factor,data=t.train)
summary(reg)

## 
## Call:
## lm(formula = factor, data = t.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -372.89  -62.29   -9.77   51.22  500.26 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.885e+01  1.129e+01  -3.442 0.000581 ***
## season2      3.423e+01  4.718e+00   7.255 4.43e-13 ***
## season3      1.681e+01  5.972e+00   2.815 0.004891 ** 
## season4      6.460e+01  3.960e+00  16.312  < 2e-16 ***
## workingday1 -8.090e+00  3.380e+00  -2.394 0.016696 *  
## weather2    -6.584e+00  3.125e+00  -2.107 0.035137 *  
## weather3    -6.591e+01  5.244e+00 -12.568  < 2e-16 ***
## weather4    -1.234e+02  1.108e+02  -1.114 0.265457    
## temp         7.323e+00  9.456e-01   7.745 1.08e-14 ***
## tempsq      -9.873e-03  2.309e-02  -0.428 0.668903    
## humidity    -8.468e-01  8.782e-02  -9.642  < 2e-16 ***
## time1       -1.100e+01  8.809e+00  -1.249 0.211795    
## time2       -1.776e+01  8.873e+00  -2.002 0.045304 *  
## time3       -3.654e+01  8.987e+00  -4.066 4.83e-05 ***
## time4       -3.465e+01  8.869e+00  -3.907 9.44e-05 ***
## time5       -1.873e+01  8.987e+00  -2.084 0.037149 *  
## time6        3.782e+01  8.850e+00   4.274 1.95e-05 ***
## time7        1.769e+02  8.825e+00  20.042  < 2e-16 ***
## time8        3.097e+02  8.840e+00  35.029  < 2e-16 ***
## time9        1.624e+02  8.797e+00  18.461  < 2e-16 ***
## time10       1.095e+02  8.852e+00  12.364  < 2e-16 ***
## time11       1.303e+02  8.894e+00  14.647  < 2e-16 ***
## time12       1.746e+02  8.996e+00  19.404  < 2e-16 ***
## time13       1.646e+02  9.039e+00  18.208  < 2e-16 ***
## time14       1.439e+02  9.147e+00  15.731  < 2e-16 ***
## time15       1.547e+02  9.107e+00  16.989  < 2e-16 ***
## time16       2.153e+02  9.100e+00  23.656  < 2e-16 ***
## time17       3.870e+02  9.051e+00  42.752  < 2e-16 ***
## time18       3.477e+02  8.957e+00  38.823  < 2e-16 ***
## time19       2.355e+02  8.890e+00  26.490  < 2e-16 ***
## time20       1.587e+02  8.833e+00  17.967  < 2e-16 ***
## time21       1.066e+02  8.812e+00  12.096  < 2e-16 ***
## time22       7.240e+01  8.762e+00   8.262  < 2e-16 ***
## time23       3.554e+01  8.771e+00   4.051 5.14e-05 ***
## sunday1     -2.398e+01  4.474e+00  -5.361 8.51e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 110.5 on 7587 degrees of freedom
## Multiple R-squared:  0.6323, Adjusted R-squared:  0.6307 
## F-statistic: 383.8 on 34 and 7587 DF,  p-value: < 2.2e-16

# Create the evaluation metrics function
eval_results <- function(true, predicted, df) {
                SSE <- sum((predicted - true)^2)
                SST <- sum((true - mean(true))^2)
                R_square <- 1 - SSE / SST
                RMSE = sqrt(SSE/nrow(df))
                data.frame(
                RMSE = RMSE,
                Rsquare = R_square)
}
# Predicting and evaluating the model on the train subset of the train.csv data
predictions1 = predict(reg, newdata = t.train)
eval_results(t.train$count, predictions1, t.train)

##       RMSE  Rsquare
## 1 110.2144 0.632346

# Predicting and evaluating the model on test subset of the train.csv data
predictions2 = predict(reg, newdata = t.test)
eval_results(t.test$count,predictions2, t.test)

##       RMSE   Rsquare
## 1 109.6384 0.6275456

B. Lasso

library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-2

Mx<- model.matrix(count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday, data=t.train)
My<- t.train$count
Mx_test <- model.matrix(count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday, data=t.test)
My_test<-t.test$count
lambdas <- 10^seq(2, -3, by = -.1)
# Setting alpha = 1 implements lasso regression
lasso_reg <- cv.glmnet(Mx,My, alpha = 1, lambda = lambdas, standardize = TRUE, nfolds = 5)
# Best lambda
lambda_best <- lasso_reg$lambda.min 
lambda_best

## [1] 0.001

lasso_model <- glmnet(Mx,My, alpha = 1, lambda = lambda_best, standardize = TRUE)
predictions3 <- predict(lasso_model, s = lambda_best, newx = Mx)
eval_results(t.train$count,predictions3,t.train)

##       RMSE   Rsquare
## 1 110.0277 0.6335902

predictions4 <- predict(lasso_model, s = lambda_best, newx = Mx_test)
eval_results(t.test$count, predictions4, t.test)

##       RMSE   Rsquare
## 1 109.3803 0.6292975

C. CART

library(rpart)
# Fit the model on the t.train set
set.seed(123)
cart <- train(
  count~season+holiday+workingday+weather+temp+tempsq+atemp+humidity+windspeed+time+sunday,
  data=t.train, method = "rpart",
  trControl = trainControl("cv", number = 10),
  tuneLength = 10
  )

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.

# Print the best tuning parameter cp that
# minimize the model RMSE
cart$bestTune

##           cp
## 1 0.01095693

# Decision rules in the model
cart$finalModel

## n= 7622 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 7622 251829300.0 191.33880  
##    2) atemp< 30.6825 5597 133027100.0 150.26750  
##      4) time8< 0.5 5339 109286600.0 140.92040  
##        8) time17< 0.5 5151  90862450.0 132.38250  
##         16) time18< 0.5 4954  74771260.0 124.00790  
##           32) temp< 12.71 1348   7734817.0  70.37908 *
##           33) temp>=12.71 3606  61710260.0 144.05550  
##             66) humidity>=65.5 2069  28621250.0 111.70710 *
##             67) humidity< 65.5 1537  28009550.0 187.60050 *
##         17) time18>=0.5 197   7006558.0 342.97970 *
##        9) time17>=0.5 188   7760670.0 374.85110 *
##      5) time8>=0.5 258  13621280.0 343.69380  
##       10) workingday1< 0.5 89    832546.8 113.37080 *
##       11) workingday1>=0.5 169   5581014.0 464.98820 *
##    3) atemp>=30.6825 2025  83265560.0 304.85780  
##      6) time17< 0.5 1896  64784810.0 283.05700  
##       12) time18< 0.5 1768  49226750.0 262.24660  
##         24) humidity>=55.5 829  21365930.0 206.50660 *
##         25) humidity< 55.5 939  23011240.0 311.45690  
##           50) workingday1>=0.5 663  10961380.0 262.71950 *
##           51) workingday1< 0.5 276   6691951.0 428.53260 *
##       13) time18>=0.5 128   4216578.0 570.50000 *
##      7) time17>=0.5 129   4335218.0 625.27910 *

# Make predictions on the test data
predictions5 <- cart %>% predict(t.train)
eval_results(t.train$count,predictions5,t.train)

##       RMSE   Rsquare
## 1 132.1548 0.4713979

predictions6 <- cart %>% predict(t.test)
eval_results(t.test$count,predictions6,t.test)

##       RMSE   Rsquare
## 1 134.3513 0.4407173

D. RANDOM FOREST

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Using random forest to rank variables according to their importance
rf <- randomForest(count~season+holiday+workingday+weather+temp+tempsq+humidity+windspeed+time+sunday,data=t.train,importance = TRUE)
imp <- importance(rf, type=1)
featureImportance <- data.frame(Feature=row.names(imp), Importance=imp[,1])
# Graph variables based on its importance
ggplot(featureImportance, aes(x=reorder(Feature, Importance), y=Importance)) +
  geom_bar(stat="identity", fill="#53cfff") +
  coord_flip() + 
  theme_light(base_size=20) +
  xlab("Importance") +
  ylab("") + 
  ggtitle("Random Forest Feature Importance\n") +
  theme(plot.title=element_text(size=18))

# Evaluate random forest
predictions7 <- predict(rf, newdata = t.train)
eval_results(t.train$count,predictions7,t.train)

##       RMSE  Rsquare
## 1 42.00505 0.946597

predictions8 <- predict(rf, newdata = t.test)
eval_results(t.test$count, predictions8,t.test)

##       RMSE   Rsquare
## 1 72.98046 0.8349708

Use random forest for prediction

# Predict count on test.csv using random forest
finalpredictions <- round(predict(rf, newdata = test))
result <- data.frame(datetime = test$datetime, count=finalpredictions)

Append train.csv and test.csv into one dataset so we have the complete data

test$count <- finalpredictions
complete <- dplyr::bind_rows(train, test)

Visualization

Visualize bike demand according to seasons

ggplot()+ aes(x=factor(complete$season),y=complete$count)+geom_bar(stat="identity",fill="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma) + scale_x_discrete(name="Seasons",labels = c("Spring", "Summer", "Fall", "Winter")) + ggtitle("Bike Demand based on Seasons")

Visualize bike demand according to hours

ggplot()+ aes(x=factor(complete$time),y=complete$count)+geom_bar(stat="identity",color="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma)+xlab("Time")+ggtitle("Bike Demand based on Hours")

Visualize bike demand according to weather patterns

ggplot()+ aes(x=factor(complete$weather),y=complete$count)+geom_bar(stat="identity",color="blue")+scale_y_continuous(name="Total bike count",labels = scales::comma)+scale_x_discrete(name="Weather patterns",labels = c("Clear, partly cloudy","Mist, cloudy","Light snow, light rain","Heavy rain, snow"))+ggtitle("Bike Demand based on Weather patterns")

Visualize bike demand according to temp

ggplot(data = complete, mapping = aes(x = temp, y = count, color = temp)) +
geom_point(alpha = 1) + scale_colour_gradientn(colors=rainbow(4), trans="reverse") + xlab("Temperature") + ylab ("Count of Bikes") + ggtitle("Bike Demand based on Temp")

Visualize bike demand according to types of users

ggplot(train, aes(x=time, y=count,)) + 
  geom_boxplot() + xlab("hour") + ylab("count") + ggtitle("Hourly Bike Demand for Total Users")

ggplot(train, aes(x=time, y=casual)) + 
  geom_boxplot() + xlab("hour") + ylab("casual users")+ ggtitle("Hourly Bike Demand for Casual Users")

ggplot(train, aes(x=time, y=registered)) + 
  geom_boxplot() + xlab("hour") + ylab("registered users")+ggtitle("Hourly Bike Demand for Registered Users")

Predicting bike-share demand for Capital Bikeshare in Washington, D.C.

Load necessary packages into library

Data pre-processsing

A. MULTIPLE LINEAR REGRESSION

B. Lasso

C. CART

D. RANDOM FOREST

Use random forest for prediction

Append train.csv and test.csv into one dataset so we have the complete data

Visualization

Visualize bike demand according to seasons

Visualize bike demand according to hours

Visualize bike demand according to weather patterns

Visualize bike demand according to temp

Visualize bike demand according to types of users