library(ggplot2)
library(ggcorrplot)
library(e1071)
library(xgboost)
# Set the default width and height used when plots are rendered.
# (Only the plot size is configured here; warning behaviour is unchanged.)
options(repr.plot.width = 12, repr.plot.height = 8)

# Load the housing dataset from the working directory and preview the
# first rows.
data <- read.csv("data.csv")
head(data)
date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | sqft_above | sqft_basement | yr_built | yr_renovated | street | city | statezip | country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <int> | <int> | <dbl> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | <chr> | |
1 | 2014-05-02 00:00:00 | 313000 | 3 | 1.50 | 1340 | 7912 | 1.5 | 0 | 0 | 3 | 1340 | 0 | 1955 | 2005 | 18810 Densmore Ave N | Shoreline | WA 98133 | USA |
2 | 2014-05-02 00:00:00 | 2384000 | 5 | 2.50 | 3650 | 9050 | 2.0 | 0 | 4 | 5 | 3370 | 280 | 1921 | 0 | 709 W Blaine St | Seattle | WA 98119 | USA |
3 | 2014-05-02 00:00:00 | 342000 | 3 | 2.00 | 1930 | 11947 | 1.0 | 0 | 0 | 4 | 1930 | 0 | 1966 | 0 | 26206-26214 143rd Ave SE | Kent | WA 98042 | USA |
4 | 2014-05-02 00:00:00 | 420000 | 3 | 2.25 | 2000 | 8030 | 1.0 | 0 | 0 | 4 | 1000 | 1000 | 1963 | 0 | 857 170th Pl NE | Bellevue | WA 98008 | USA |
5 | 2014-05-02 00:00:00 | 550000 | 4 | 2.50 | 1940 | 10500 | 1.0 | 0 | 0 | 4 | 1140 | 800 | 1976 | 1992 | 9105 170th Ave NE | Redmond | WA 98052 | USA |
6 | 2014-05-02 00:00:00 | 490000 | 2 | 1.00 | 880 | 6380 | 1.0 | 0 | 0 | 3 | 880 | 0 | 1938 | 1994 | 522 NE 88th St | Seattle | WA 98115 | USA |
# Preview the last six rows of the raw dataset.
tail(data, n = 6L)
date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | sqft_above | sqft_basement | yr_built | yr_renovated | street | city | statezip | country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <int> | <int> | <dbl> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | <chr> | |
4595 | 2014-07-09 00:00:00 | 210614.3 | 3 | 2.50 | 1610 | 7223 | 2 | 0 | 0 | 3 | 1610 | 0 | 1994 | 0 | 26306 127th Ave SE | Kent | WA 98030 | USA |
4596 | 2014-07-09 00:00:00 | 308166.7 | 3 | 1.75 | 1510 | 6360 | 1 | 0 | 0 | 4 | 1510 | 0 | 1954 | 1979 | 501 N 143rd St | Seattle | WA 98133 | USA |
4597 | 2014-07-09 00:00:00 | 534333.3 | 3 | 2.50 | 1460 | 7573 | 2 | 0 | 0 | 3 | 1460 | 0 | 1983 | 2009 | 14855 SE 10th Pl | Bellevue | WA 98007 | USA |
4598 | 2014-07-09 00:00:00 | 416904.2 | 3 | 2.50 | 3010 | 7014 | 2 | 0 | 0 | 3 | 3010 | 0 | 2009 | 0 | 759 Ilwaco Pl NE | Renton | WA 98059 | USA |
4599 | 2014-07-10 00:00:00 | 203400.0 | 4 | 2.00 | 2090 | 6630 | 1 | 0 | 0 | 3 | 1070 | 1020 | 1974 | 0 | 5148 S Creston St | Seattle | WA 98178 | USA |
4600 | 2014-07-10 00:00:00 | 220600.0 | 3 | 2.50 | 1490 | 8102 | 2 | 0 | 0 | 4 | 1490 | 0 | 1990 | 0 | 18717 SE 258th St | Covington | WA 98042 | USA |
# Report the dataset dimensions: rows are records, columns are features.
print(paste("Number of records: ", dim(data)[1L]))
print(paste("Number of features: ", dim(data)[2L]))
[1] "Number of records: 4600" [1] "Number of features: 18"
# Per-column summary of the raw dataset.
summary(object = data)
date price bedrooms bathrooms Length:4600 Min. : 0 Min. :0.000 Min. :0.000 Class :character 1st Qu.: 322875 1st Qu.:3.000 1st Qu.:1.750 Mode :character Median : 460943 Median :3.000 Median :2.250 Mean : 551963 Mean :3.401 Mean :2.161 3rd Qu.: 654962 3rd Qu.:4.000 3rd Qu.:2.500 Max. :26590000 Max. :9.000 Max. :8.000 sqft_living sqft_lot floors waterfront Min. : 370 Min. : 638 Min. :1.000 Min. :0.000000 1st Qu.: 1460 1st Qu.: 5001 1st Qu.:1.000 1st Qu.:0.000000 Median : 1980 Median : 7683 Median :1.500 Median :0.000000 Mean : 2139 Mean : 14852 Mean :1.512 Mean :0.007174 3rd Qu.: 2620 3rd Qu.: 11001 3rd Qu.:2.000 3rd Qu.:0.000000 Max. :13540 Max. :1074218 Max. :3.500 Max. :1.000000 view condition sqft_above sqft_basement Min. :0.0000 Min. :1.000 Min. : 370 Min. : 0.0 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:1190 1st Qu.: 0.0 Median :0.0000 Median :3.000 Median :1590 Median : 0.0 Mean :0.2407 Mean :3.452 Mean :1827 Mean : 312.1 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.:2300 3rd Qu.: 610.0 Max. :4.0000 Max. :5.000 Max. :9410 Max. :4820.0 yr_built yr_renovated street city Min. :1900 Min. : 0.0 Length:4600 Length:4600 1st Qu.:1951 1st Qu.: 0.0 Class :character Class :character Median :1976 Median : 0.0 Mode :character Mode :character Mean :1971 Mean : 808.6 3rd Qu.:1997 3rd Qu.:1999.0 Max. :2014 Max. :2014.0 statezip country Length:4600 Length:4600 Class :character Class :character Mode :character Mode :character
# Inspect the available column names and the distinct cities in the data.
names(data)
unique(data[["city"]])

# Keep the price column together with the numeric columns used in the rest
# of the analysis, then preview the reduced data frame.
maindf <- data[c(
  "price", "bedrooms", "sqft_living", "floors",
  "sqft_lot", "condition", "view", "yr_built"
)]
head(maindf)
price | bedrooms | sqft_living | floors | sqft_lot | condition | view | yr_built | |
---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <int> | <dbl> | <int> | <int> | <int> | <int> | |
1 | 313000 | 3 | 1340 | 1.5 | 7912 | 3 | 0 | 1955 |
2 | 2384000 | 5 | 3650 | 2.0 | 9050 | 5 | 4 | 1921 |
3 | 342000 | 3 | 1930 | 1.0 | 11947 | 4 | 0 | 1966 |
4 | 420000 | 3 | 2000 | 1.0 | 8030 | 4 | 0 | 1963 |
5 | 550000 | 4 | 1940 | 1.0 | 10500 | 4 | 0 | 1976 |
6 | 490000 | 2 | 880 | 1.0 | 6380 | 3 | 0 | 1938 |
# Count missing values across all selected columns.
sum(is.na(maindf))

# Derive the building age at the time of sale. The age is anchored to the
# sale year, taken from the first four characters of `data$date` (values look
# like "2014-05-02 00:00:00"), rather than to the current system year, so the
# result does not drift depending on when the script is run.
# `maindf` was built from `data` without reordering rows, so the two align.
sale_year <- as.integer(substr(data$date, 1L, 4L))
maindf$oldbuilt <- sale_year - maindf$yr_built

# The construction year is now redundant; drop it so only `oldbuilt` remains.
maindf$yr_built <- NULL

# Pairwise correlations between all remaining columns.
cor(maindf)
price | bedrooms | sqft_living | floors | sqft_lot | condition | view | oldbuilt | |
---|---|---|---|---|---|---|---|---|
price | 1.00000000 | 0.20033629 | 0.43041003 | 0.15146080 | 0.050451295 | 0.034914537 | 0.22850417 | -0.02185683 |
bedrooms | 0.20033629 | 1.00000000 | 0.59488406 | 0.17789490 | 0.068819355 | 0.025079856 | 0.11102800 | -0.14246104 |
sqft_living | 0.43041003 | 0.59488406 | 1.00000000 | 0.34485027 | 0.210538454 | -0.062825979 | 0.31100944 | -0.28777522 |
floors | 0.15146080 | 0.17789490 | 0.34485027 | 1.00000000 | 0.003749750 | -0.275013395 | 0.03121095 | -0.46748066 |
sqft_lot | 0.05045130 | 0.06881935 | 0.21053845 | 0.00374975 | 1.000000000 | 0.000558114 | 0.07390674 | -0.05070635 |
condition | 0.03491454 | 0.02507986 | -0.06282598 | -0.27501339 | 0.000558114 | 1.000000000 | 0.06307728 | 0.39969823 |
view | 0.22850417 | 0.11102800 | 0.31100944 | 0.03121095 | 0.073906741 | 0.063077281 | 1.00000000 | 0.06446506 |
oldbuilt | -0.02185683 | -0.14246104 | -0.28777522 | -0.46748066 | -0.050706346 | 0.399698234 | 0.06446506 | 1.00000000 |
# Correlation matrix rounded to one decimal place for the tile labels.
corr <- round(cor(maindf), digits = 1)

# Correlogram showing only the lower triangle, with the rounded coefficients
# printed on the tiles.
ggcorrplot(
  corr,
  type = "lower",
  lab = TRUE,
  lab_size = 5,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of Housing Dataset",
  ggtheme = theme_bw
)
# Scatterplot matrix of four features, selected as a data-frame subset
# instead of through a formula.
pairs(
  maindf[c("bedrooms", "sqft_living", "floors", "condition")],
  main = "Scatterplot Matrix"
)
# Box plots of six features on a grid of 2 rows x 3 columns.
# par() returns the previous settings; they are restored afterwards so the
# grid layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(2, 3))

# Column name -> panel title (titles kept exactly as originally displayed).
boxplot_titles <- c(
  bedrooms = "Bedrooms",
  sqft_living = "sqft_living",
  floors = "floors",
  condition = "condition",
  view = "view",
  oldbuilt = "oldbuilt"
)
for (column in names(boxplot_titles)) {
  boxplot(maindf[[column]], main = boxplot_titles[[column]])
}

par(old_par)
# Count plot of bedrooms against floors, drawn with the black-and-white
# theme, which is set as the default for all subsequent ggplot2 plots.
theme_set(theme_bw())
g <- ggplot(data = maindf, mapping = aes(x = bedrooms, y = floors))
g +
  geom_count(colour = "tomato3", show.legend = FALSE) +
  labs(
    title = "Bedrooms vs Floors",
    x = "bedrooms",
    y = "floors"
  )
# Base-graphics scatter plot of living area against lot size. The axis
# limits restrict the view to 0-3000 on the x axis and 0-20000 on the y axis.
with(
  maindf,
  plot(
    x = sqft_living,
    y = sqft_lot,
    main = "sqft_living vs sqft_lot",
    xlab = "sqft_living",
    ylab = "sqft_lot",
    xlim = c(0, 3000),
    ylim = c(0, 20000)
  )
)
# Density plots of six features on a grid of 2 rows x 3 columns; the skewness
# of each feature is printed as the panel's sub-title.
# par() returns the previous settings; they are restored afterwards so the
# grid layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(2, 3))

# Column name -> label used in the panel title (kept as originally displayed).
density_titles <- c(
  bedrooms = "Bedrooms",
  sqft_living = "sqft_living",
  sqft_lot = "sqft_lot",
  condition = "condition",
  floors = "floors",
  oldbuilt = "oldbuilt"
)
# Fill colours alternate panel by panel: green, orange, green, ...
fill_colours <- c("green", "orange")

for (i in seq_along(density_titles)) {
  values <- maindf[[names(density_titles)[i]]]
  # Estimate the density once and reuse it for both the outline and the fill.
  dens <- density(values)
  plot(
    dens,
    main = paste("Density Plot:", density_titles[[i]]),
    ylab = "Frequency",
    sub = paste("Skewness:", round(e1071::skewness(values), 2))
  )
  polygon(dens, col = fill_colours[(i - 1L) %% 2L + 1L])
}

par(old_par)
# Build the model inputs: a one-column numeric matrix holding the living area
# (the only feature) and the price vector as the target.
X <- as.matrix(maindf$sqft_living)
y <- maindf$price

# Split the rows 80% / 20% into training and test sets. The seed is fixed so
# the split, and every number derived from it, is reproducible between runs.
set.seed(42)
n_obs <- nrow(maindf)
train_indices <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))
# Select the test rows explicitly: negative indexing with an empty training
# index would return no rows instead of all of them.
test_indices <- setdiff(seq_len(n_obs), train_indices)

X_train <- X[train_indices, , drop = FALSE]
y_train <- y[train_indices]
X_test <- X[test_indices, , drop = FALSE]
y_test <- y[test_indices]
# Fit the XGBoost model on the training split.
# The "reg:squarederror" objective configures the booster for regression.
params <- list(objective = "reg:squarederror")
dtrain <- xgb.DMatrix(data = X_train, label = y_train)
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100
)
# Score every row of X (training and test rows alike) and store the result
# next to the data so it can be plotted.
maindf$predictions <- predict(xgb_model, newdata = xgb.DMatrix(data = X))

# Actual prices as points, with the model's predictions drawn on top as a
# dashed blue line. Both axes are limited via xlim() / ylim().
ggplot(maindf, aes(x = sqft_living, y = price)) +
  geom_point() +
  geom_line(aes(y = predictions), colour = "blue", linetype = "dashed") +
  xlim(0, 9000) +
  ylim(0, 5000000)
# Score a single hypothetical house with 3000 sqft of living area.
# The value is wrapped in a 1 x 1 matrix, i.e. one row with one feature
# column, the same column layout as X.
new_sqft_living <- matrix(3000, nrow = 1, ncol = 1)
dnew <- xgb.DMatrix(data = new_sqft_living)
predicted_price <- predict(xgb_model, dnew)

# Report the prediction rounded to two decimals.
print(
  paste(
    "Predicted price for 3000 sqft living is:",
    round(predicted_price, 2)
  )
)
[1] "Predicted price for 3000 sqft living is: 791705.94"