Predict House Price - R programming - Ayoub MABROUK

Import libraries

In [1]:
library(ggplot2)
library(ggcorrplot)
library(e1071)
library(xgboost)

# Set the default width/height used when plots are rendered in the notebook.
# Only the plot size is changed here; no warning-related option is touched.
options(repr.plot.width = 12, repr.plot.height = 8)

Import dataset

In [2]:
# Load the housing dataset from the working directory, then preview the
# first rows to confirm the columns were parsed as expected.
data <- read.csv("data.csv")
head(data)
A data.frame: 6 × 18
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition sqft_above sqft_basement yr_built yr_renovated street city statezip country
<chr> <dbl> <dbl> <dbl> <int> <int> <dbl> <int> <int> <int> <int> <int> <int> <int> <chr> <chr> <chr> <chr>
1 2014-05-02 00:00:00 313000 3 1.50 1340 7912 1.5 0 0 3 1340 0 1955 2005 18810 Densmore Ave N Shoreline WA 98133 USA
2 2014-05-02 00:00:00 2384000 5 2.50 3650 9050 2.0 0 4 5 3370 280 1921 0 709 W Blaine St Seattle WA 98119 USA
3 2014-05-02 00:00:00 342000 3 2.00 1930 11947 1.0 0 0 4 1930 0 1966 0 26206-26214 143rd Ave SE Kent WA 98042 USA
4 2014-05-02 00:00:00 420000 3 2.25 2000 8030 1.0 0 0 4 1000 1000 1963 0 857 170th Pl NE Bellevue WA 98008 USA
5 2014-05-02 00:00:00 550000 4 2.50 1940 10500 1.0 0 0 4 1140 800 1976 1992 9105 170th Ave NE Redmond WA 98052 USA
6 2014-05-02 00:00:00 490000 2 1.00 880 6380 1.0 0 0 3 880 0 1938 1994 522 NE 88th St Seattle WA 98115 USA

Data exploration

In [3]:
# Show the last rows of the dataset to check how the file ends
tail(data)
A data.frame: 6 × 18
date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition sqft_above sqft_basement yr_built yr_renovated street city statezip country
<chr> <dbl> <dbl> <dbl> <int> <int> <dbl> <int> <int> <int> <int> <int> <int> <int> <chr> <chr> <chr> <chr>
4595 2014-07-09 00:00:00 210614.3 3 2.50 1610 7223 2 0 0 3 1610 0 1994 0 26306 127th Ave SE Kent WA 98030 USA
4596 2014-07-09 00:00:00 308166.7 3 1.75 1510 6360 1 0 0 4 1510 0 1954 1979 501 N 143rd St Seattle WA 98133 USA
4597 2014-07-09 00:00:00 534333.3 3 2.50 1460 7573 2 0 0 3 1460 0 1983 2009 14855 SE 10th Pl Bellevue WA 98007 USA
4598 2014-07-09 00:00:00 416904.2 3 2.50 3010 7014 2 0 0 3 3010 0 2009 0 759 Ilwaco Pl NE Renton WA 98059 USA
4599 2014-07-10 00:00:00 203400.0 4 2.00 2090 6630 1 0 0 3 1070 1020 1974 0 5148 S Creston St Seattle WA 98178 USA
4600 2014-07-10 00:00:00 220600.0 3 2.50 1490 8102 2 0 0 4 1490 0 1990 0 18717 SE 258th St Covington WA 98042 USA
In [4]:
# Report the dataset dimensions (rows = records, columns = features).
# paste() inserts its default " " separator after the label's trailing space,
# which is why the printed text shows two spaces before the number.
data_dim <- dim(data)
print(paste("Number of records: ", data_dim[1]))
print(paste("Number of features: ", data_dim[2]))
[1] "Number of records:  4600"
[1] "Number of features:  18"
In [5]:
# Per-column overview: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns
summary(data)
     date               price             bedrooms       bathrooms    
 Length:4600        Min.   :       0   Min.   :0.000   Min.   :0.000  
 Class :character   1st Qu.:  322875   1st Qu.:3.000   1st Qu.:1.750  
 Mode  :character   Median :  460943   Median :3.000   Median :2.250  
                    Mean   :  551963   Mean   :3.401   Mean   :2.161  
                    3rd Qu.:  654962   3rd Qu.:4.000   3rd Qu.:2.500  
                    Max.   :26590000   Max.   :9.000   Max.   :8.000  
  sqft_living       sqft_lot           floors        waterfront      
 Min.   :  370   Min.   :    638   Min.   :1.000   Min.   :0.000000  
 1st Qu.: 1460   1st Qu.:   5001   1st Qu.:1.000   1st Qu.:0.000000  
 Median : 1980   Median :   7683   Median :1.500   Median :0.000000  
 Mean   : 2139   Mean   :  14852   Mean   :1.512   Mean   :0.007174  
 3rd Qu.: 2620   3rd Qu.:  11001   3rd Qu.:2.000   3rd Qu.:0.000000  
 Max.   :13540   Max.   :1074218   Max.   :3.500   Max.   :1.000000  
      view          condition       sqft_above   sqft_basement   
 Min.   :0.0000   Min.   :1.000   Min.   : 370   Min.   :   0.0  
 1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:1190   1st Qu.:   0.0  
 Median :0.0000   Median :3.000   Median :1590   Median :   0.0  
 Mean   :0.2407   Mean   :3.452   Mean   :1827   Mean   : 312.1  
 3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.:2300   3rd Qu.: 610.0  
 Max.   :4.0000   Max.   :5.000   Max.   :9410   Max.   :4820.0  
    yr_built     yr_renovated       street              city          
 Min.   :1900   Min.   :   0.0   Length:4600        Length:4600       
 1st Qu.:1951   1st Qu.:   0.0   Class :character   Class :character  
 Median :1976   Median :   0.0   Mode  :character   Mode  :character  
 Mean   :1971   Mean   : 808.6                                        
 3rd Qu.:1997   3rd Qu.:1999.0                                        
 Max.   :2014   Max.   :2014.0                                        
   statezip           country         
 Length:4600        Length:4600       
 Class :character   Class :character  
 Mode  :character   Mode  :character  
                                      
                                      
                                      
In [6]:
names(data) # column names of the data frame
  1. 'date'
  2. 'price'
  3. 'bedrooms'
  4. 'bathrooms'
  5. 'sqft_living'
  6. 'sqft_lot'
  7. 'floors'
  8. 'waterfront'
  9. 'view'
  10. 'condition'
  11. 'sqft_above'
  12. 'sqft_basement'
  13. 'yr_built'
  14. 'yr_renovated'
  15. 'street'
  16. 'city'
  17. 'statezip'
  18. 'country'
In [7]:
unique(data[["city"]]) # distinct city names, in order of first appearance
  1. 'Shoreline'
  2. 'Seattle'
  3. 'Kent'
  4. 'Bellevue'
  5. 'Redmond'
  6. 'Maple Valley'
  7. 'North Bend'
  8. 'Lake Forest Park'
  9. 'Sammamish'
  10. 'Auburn'
  11. 'Des Moines'
  12. 'Bothell'
  13. 'Federal Way'
  14. 'Kirkland'
  15. 'Issaquah'
  16. 'Woodinville'
  17. 'Normandy Park'
  18. 'Fall City'
  19. 'Renton'
  20. 'Carnation'
  21. 'Snoqualmie'
  22. 'Duvall'
  23. 'Burien'
  24. 'Covington'
  25. 'Inglewood-Finn Hill'
  26. 'Kenmore'
  27. 'Newcastle'
  28. 'Mercer Island'
  29. 'Black Diamond'
  30. 'Ravensdale'
  31. 'Clyde Hill'
  32. 'Algona'
  33. 'Skykomish'
  34. 'Tukwila'
  35. 'Vashon'
  36. 'Yarrow Point'
  37. 'SeaTac'
  38. 'Medina'
  39. 'Enumclaw'
  40. 'Snoqualmie Pass'
  41. 'Pacific'
  42. 'Beaux Arts Village'
  43. 'Preston'
  44. 'Milton'

Feature selection

In [8]:
# Keep only the target (price) and the numeric features used in the analysis.
selected_columns <- c(
  "price", "bedrooms", "sqft_living", "floors",
  "sqft_lot", "condition", "view", "yr_built"
)
maindf <- data[, selected_columns]
head(maindf)
A data.frame: 6 × 8
price bedrooms sqft_living floors sqft_lot condition view yr_built
<dbl> <dbl> <int> <dbl> <int> <int> <int> <int>
1 313000 3 1340 1.5 7912 3 0 1955
2 2384000 5 3650 2.0 9050 5 4 1921
3 342000 3 1930 1.0 11947 4 0 1966
4 420000 3 2000 1.0 8030 4 0 1963
5 550000 4 1940 1.0 10500 4 0 1976
6 490000 2 880 1.0 6380 3 0 1938

Checking Null values

In [9]:
# Total number of missing (NA) cells across all selected columns
sum(is.na(maindf))
0

Figure out house age

In [10]:
# House age in years at the time of sale.
# The age is measured against the sale year taken from `data$date` rather than
# against today's date, so the feature does not change depending on when the
# notebook is run. `maindf` is a column subset of `data`, so rows are expected
# to line up one-to-one; the check below guards that assumption.
stopifnot(nrow(maindf) == nrow(data))
sale_year <- as.integer(format(as.Date(data$date), "%Y"))
maindf$oldbuilt <- sale_year - maindf$yr_built

# yr_built is now redundant with oldbuilt, so drop it
maindf$yr_built <- NULL

Plot Correlation matrix

In [11]:
# Pairwise correlation matrix of all columns currently in maindf
cor(maindf)
A matrix: 8 × 8 of type dbl
price bedrooms sqft_living floors sqft_lot condition view oldbuilt
price 1.00000000 0.20033629 0.43041003 0.15146080 0.050451295 0.034914537 0.22850417 -0.02185683
bedrooms 0.20033629 1.00000000 0.59488406 0.17789490 0.068819355 0.025079856 0.11102800 -0.14246104
sqft_living 0.43041003 0.59488406 1.00000000 0.34485027 0.210538454 -0.062825979 0.31100944 -0.28777522
floors 0.15146080 0.17789490 0.34485027 1.00000000 0.003749750 -0.275013395 0.03121095 -0.46748066
sqft_lot 0.05045130 0.06881935 0.21053845 0.00374975 1.000000000 0.000558114 0.07390674 -0.05070635
condition 0.03491454 0.02507986 -0.06282598 -0.27501339 0.000558114 1.000000000 0.06307728 0.39969823
view 0.22850417 0.11102800 0.31100944 0.03121095 0.073906741 0.063077281 1.00000000 0.06446506
oldbuilt -0.02185683 -0.14246104 -0.28777522 -0.46748066 -0.050706346 0.399698234 0.06446506 1.00000000
In [12]:
# Correlation matrix rounded to one decimal so the cell labels stay readable
corr <- round(cor(maindf), digits = 1)

# Lower-triangle correlogram with the coefficient printed in each cell
ggcorrplot(
  corr,
  type = "lower",
  lab = TRUE,
  lab_size = 5,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of Housing Dataset",
  ggtheme = theme_bw
)

Plot scatterplot matrix

In [13]:
# Scatterplot matrix of four features, selected by column name
pair_columns <- c("bedrooms", "sqft_living", "floors", "condition")
pairs(maindf[, pair_columns], main = "Scatterplot Matrix")

Plot boxplot for checking outliers

In [14]:
# Draw one boxplot per feature on a 2-row x 3-column grid to spot outliers.
# Names are the maindf columns to plot; values are the panel titles.
boxplot_titles <- c(
  bedrooms = "Bedrooms",
  sqft_living = "sqft_living",
  floors = "floors",
  condition = "condition",
  view = "view",
  oldbuilt = "oldbuilt"
)

# par() returns the previous settings, kept so they can be restored below
old_par <- par(mfrow = c(2, 3))
for (feature in names(boxplot_titles)) {
  boxplot(maindf[[feature]], main = boxplot_titles[[feature]])
}
# Restore the layout so later base plots are not squeezed into the 2 x 3 grid
par(old_par)

Plot scatterplots

In [15]:
# Scatterplot
# Count plot of bedrooms vs floors: point size reflects how many houses share
# each (bedrooms, floors) combination. The size legend is hidden.
theme_set(theme_bw())
g <- ggplot(maindf, aes(bedrooms, floors))
g +
  # FALSE instead of F: F is an ordinary variable that can be reassigned
  geom_count(col = "tomato3", show.legend = FALSE) +
  labs(
    y = "floors",
    x = "bedrooms",
    title = "Bedrooms vs Floors"
  )
In [16]:
# Base-graphics scatter of living area against lot size. The axis limits
# restrict the view to 0-3000 sqft living and 0-20000 sqft lot.
with(
  maindf,
  plot(
    x = sqft_living,
    y = sqft_lot,
    main = "sqft_living vs sqft_lot",
    xlab = "sqft_living",
    ylab = "sqft_lot",
    xlim = c(0, 3000),
    ylim = c(0, 20000)
  )
)

Plot density plot to check normality

In [17]:
# Draw a filled kernel-density plot for one numeric vector.
#   values: numeric vector to estimate the density of
#   label:  text appended to the "Density Plot:" title
#   fill:   colour used to fill the area under the curve
# The skewness (e1071::skewness) is shown, rounded to 2 decimals, as subtitle.
plot_density <- function(values, label, fill) {
  dens <- density(values)  # computed once, reused for the curve and the fill
  # NOTE(review): the y axis of a density curve is a density, not a frequency;
  # the original "Frequency" label is kept so the figure is unchanged.
  plot(dens,
       main = paste("Density Plot:", label), ylab = "Frequency",
       sub = paste("Skewness:", round(e1071::skewness(values), 2)))
  polygon(dens, col = fill)
  invisible(dens)
}

# One panel per feature, in display order (filled row by row on a 2 x 3 grid)
density_panels <- list(
  list(column = "bedrooms",    label = "Bedrooms",    fill = "green"),
  list(column = "sqft_living", label = "sqft_living", fill = "orange"),
  list(column = "sqft_lot",    label = "sqft_lot",    fill = "green"),
  list(column = "condition",   label = "condition",   fill = "orange"),
  list(column = "floors",      label = "floors",      fill = "green"),
  list(column = "oldbuilt",    label = "oldbuilt",    fill = "orange")
)

# par() returns the previous settings, kept so they can be restored below
old_par <- par(mfrow = c(2, 3))
for (panel in density_panels) {
  plot_density(maindf[[panel$column]], panel$label, panel$fill)
}
# Restore the layout so later base plots are not drawn into the 2 x 3 grid
par(old_par)

Plot univariate XGBoost regression between sqft_living and price

In [18]:
# Convert the data to a matrix format required by XGBoost
X <- as.matrix(maindf$sqft_living)
y <- maindf$price

# Split data into training and testing (optional)
train_indices <- sample(1:nrow(maindf), 0.8 * nrow(maindf)) # 80% training data
X_train <- X[train_indices, , drop = FALSE]
y_train <- y[train_indices]
X_test <- X[-train_indices, , drop = FALSE]
y_test <- y[-train_indices]

# Train XGBoost model
dtrain <- xgb.DMatrix(data = X_train, label = y_train)
params <- list(objective = "reg:squarederror")  # for regression
xgb_model <- xgb.train(params = params, data = dtrain, nrounds = 100)

# Predict using the model
maindf$predictions <- predict(xgb_model, xgb.DMatrix(data = X))
In [19]:
# Actual prices as points, with the model's predictions overlaid as a dashed
# blue line. Both layers share the x aesthetic (sqft_living) set in ggplot().
ggplot(maindf, aes(x = sqft_living, y = price)) +
  geom_point() +
  geom_line(aes(y = predictions), color = "blue", linetype = "dashed") +
  lims(x = c(0, 9000), y = c(0, 5000000))
In [20]:
# Example: New data point for sqft_living
new_sqft_living <- matrix(3000, ncol = 1)  # Example: 3000 square feet

# Convert to DMatrix for prediction
dnew <- xgb.DMatrix(data = new_sqft_living)
# Predict using the trained model
predicted_price <- predict(xgb_model, dnew)

# Print the predicted price
print(paste("Predicted price for 3000 sqft living is:", round(predicted_price, 2)))
[1] "Predicted price for 3000 sqft living is: 791705.94"