library(ggplot2)
library(ggcorrplot)
library(e1071)
library(xgboost)

# Set the default size of plots rendered in the notebook (repr.plot.* options).
# Note: this only changes plot dimensions; it does not suppress warnings.
options(repr.plot.height = 8, repr.plot.width = 12)


            
              # Load the dataset from data.csv (path is relative to the working directory).
data <- read.csv("data.csv")

# Preview the first and the last rows.
head(data)

tail(data)

# Report how many rows (records) and columns (features) were read.
print(paste("Number of records: ", dim(data)[1]))
print(paste("Number of features: ", dim(data)[2]))

[1] "Number of records:  4600"
[1] "Number of features:  18"


            
              # Per-column summary statistics (min/quartiles/mean/max or type info).
summary(object = data)

     date               price             bedrooms       bathrooms    
 Length:4600        Min.   :       0   Min.   :0.000   Min.   :0.000  
 Class :character   1st Qu.:  322875   1st Qu.:3.000   1st Qu.:1.750  
 Mode  :character   Median :  460943   Median :3.000   Median :2.250  
                    Mean   :  551963   Mean   :3.401   Mean   :2.161  
                    3rd Qu.:  654962   3rd Qu.:4.000   3rd Qu.:2.500  
                    Max.   :26590000   Max.   :9.000   Max.   :8.000  
  sqft_living       sqft_lot           floors        waterfront      
 Min.   :  370   Min.   :    638   Min.   :1.000   Min.   :0.000000  
 1st Qu.: 1460   1st Qu.:   5001   1st Qu.:1.000   1st Qu.:0.000000  
 Median : 1980   Median :   7683   Median :1.500   Median :0.000000  
 Mean   : 2139   Mean   :  14852   Mean   :1.512   Mean   :0.007174  
 3rd Qu.: 2620   3rd Qu.:  11001   3rd Qu.:2.000   3rd Qu.:0.000000  
 Max.   :13540   Max.   :1074218   Max.   :3.500   Max.   :1.000000  
      view          condition       sqft_above   sqft_basement   
 Min.   :0.0000   Min.   :1.000   Min.   : 370   Min.   :   0.0  
 1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:1190   1st Qu.:   0.0  
 Median :0.0000   Median :3.000   Median :1590   Median :   0.0  
 Mean   :0.2407   Mean   :3.452   Mean   :1827   Mean   : 312.1  
 3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.:2300   3rd Qu.: 610.0  
 Max.   :4.0000   Max.   :5.000   Max.   :9410   Max.   :4820.0  
    yr_built     yr_renovated       street              city          
 Min.   :1900   Min.   :   0.0   Length:4600        Length:4600       
 1st Qu.:1951   1st Qu.:   0.0   Class :character   Class :character  
 Median :1976   Median :   0.0   Mode  :character   Mode  :character  
 Mean   :1971   Mean   : 808.6                                        
 3rd Qu.:1997   3rd Qu.:1999.0                                        
 Max.   :2014   Max.   :2014.0                                        
   statezip           country         
 Length:4600        Length:4600       
 Class :character   Class :character  
 Mode  :character   Mode  :character


            
              # List the column names of the data frame.
names(data)

# List the distinct values found in the city column.
unique(data[["city"]])


            
              # Keep only the target (price) and the numeric features used below.
selected_columns <- c(
  "price", "bedrooms", "sqft_living", "floors",
  "sqft_lot", "condition", "view", "yr_built"
)
maindf <- data[, selected_columns]
head(maindf)

# Total number of missing cells across the selected columns.
sum(is.na(maindf))


            
              # Derive the house age in years from the construction year.
# NOTE(review): age is measured against today's date (Sys.Date()), so the
# values shift with the year the script is run; the sales shown in the data
# preview are dated 2014 - confirm whether age at sale time is wanted instead.
current_year <- as.integer(format(Sys.Date(), "%Y"))
maindf$oldbuilt <- current_year - maindf$yr_built

# yr_built is fully encoded in oldbuilt, so remove it. drop = FALSE guarantees
# the result stays a data frame even if only one column were left.
drops <- c("yr_built")
maindf <- maindf[, !(names(maindf) %in% drops), drop = FALSE]


            
              # Pairwise correlations between all columns of maindf (cor() defaults).
cor_matrix <- cor(maindf)
cor_matrix

# Round to one decimal so the labels drawn inside the correlogram stay short.
corr <- round(cor_matrix, digits = 1)

# Lower-triangle correlogram with the coefficient printed in every tile.
ggcorrplot(
  corr,
  type = "lower",
  lab = TRUE,
  lab_size = 5,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of Housing Dataset",
  ggtheme = theme_bw
)


            
              # Scatterplot matrix of four features, selected through a one-sided formula.
scatter_formula <- ~ bedrooms + sqft_living + floors + condition
pairs(scatter_formula, data = maindf, main = "Scatterplot Matrix")


            
              # Boxplots for spotting outliers, on a 2-row x 3-column grid.
# par() returns the previous settings; keep them so the grid layout can be
# undone afterwards and does not leak into later base-graphics plots.
old_par <- par(mfrow = c(2, 3))
boxplot(maindf$bedrooms, main = "Bedrooms")
boxplot(maindf$sqft_living, main = "sqft_living")
boxplot(maindf$floors, main = "floors")
boxplot(maindf$condition, main = "condition")
boxplot(maindf$view, main = "view")
boxplot(maindf$oldbuilt, main = "oldbuilt")
par(old_par)


            
              # Count scatterplot: geom_count() sizes each point by the number of
# observations sharing that (bedrooms, floors) pair.
theme_set(theme_bw())
g <- ggplot(maindf, aes(bedrooms, floors))
# FALSE is spelled out because the shorthand F is an ordinary, reassignable
# variable.
g + geom_count(colour = "tomato3", show.legend = FALSE) +
  labs(
    y = "floors",
    x = "bedrooms",
    title = "Bedrooms vs Floors"
  )


            
              # Base-graphics scatterplot; the axis limits zoom in on living area up to
# 3000 and lot size up to 20000, leaving larger properties off the plot.
with(
  maindf,
  plot(
    sqft_living, sqft_lot,
    main = "sqft_living vs sqft_lot",
    xlab = "sqft_living",
    ylab = "sqft_lot",
    xlim = c(0, 3000),
    ylim = c(0, 20000)
  )
)


            
              # Filled density plots (one per feature) to eyeball normality.

# Draw one density panel: outline via plot(), fill via polygon(), with the
# sample skewness shown in the subtitle. The density estimate is computed once
# and reused for both the outline and the fill.
#   values: numeric vector to estimate the density of
#   label:  text appended to "Density Plot:" in the panel title
#   fill:   fill colour passed to polygon()
plot_density_panel <- function(values, label, fill) {
  dens <- density(values)
  plot(
    dens,
    main = paste("Density Plot:", label),
    ylab = "Frequency",
    sub = paste("Skewness:", round(e1071::skewness(values), 2))
  )
  polygon(dens, col = fill)
}

# Column name -> title label, in drawing order.
density_features <- c(
  bedrooms = "Bedrooms",
  sqft_living = "sqft_living",
  sqft_lot = "sqft_lot",
  condition = "condition",
  floors = "floors",
  oldbuilt = "oldbuilt"
)
# Alternate the two fill colours across panels.
fill_colors <- rep(c("green", "orange"), length.out = length(density_features))

# 2 rows x 3 columns; remember the previous layout so it can be restored.
old_par <- par(mfrow = c(2, 3))
for (i in seq_along(density_features)) {
  column <- names(density_features)[i]
  plot_density_panel(maindf[[column]], density_features[[i]], fill_colors[i])
}
par(old_par)


            
              # Fit a one-feature XGBoost regression: price as a function of sqft_living.
# The predictor is wrapped in a one-column matrix before being handed to
# xgb.DMatrix().
X <- as.matrix(maindf$sqft_living)
y <- maindf$price

# Fix the RNG seed so the random train/test split - and therefore the fitted
# model and every prediction derived from it - is the same on every run.
set.seed(42)

# 80% of the rows go to training, the rest are held out for testing.
# seq_len() avoids the 1:0 pitfall of 1:nrow(), and floor() makes the sample
# size an explicit whole number.
n_rows <- nrow(maindf)
train_indices <- sample(seq_len(n_rows), floor(0.8 * n_rows))
X_train <- X[train_indices, , drop = FALSE]
y_train <- y[train_indices]
X_test <- X[-train_indices, , drop = FALSE]
y_test <- y[-train_indices]

# Train with squared-error loss (regression) for 100 boosting rounds.
dtrain <- xgb.DMatrix(data = X_train, label = y_train)
params <- list(objective = "reg:squarederror")
xgb_model <- xgb.train(params = params, data = dtrain, nrounds = 100)

# Score the held-out rows so the split is actually used to judge the fit.
test_predictions <- predict(xgb_model, xgb.DMatrix(data = X_test))
test_rmse <- sqrt(mean((y_test - test_predictions)^2))
print(paste("Test RMSE:", round(test_rmse, 2)))

# Predict for every row (training and test alike); used by the overlay plot.
maindf$predictions <- predict(xgb_model, xgb.DMatrix(data = X))


            
              # Observed prices (points) with the model's predictions overlaid as a dashed
# blue line. The line layer inherits x = sqft_living from the plot-level
# mapping; xlim()/ylim() restrict both layers to the stated ranges.
ggplot(maindf, aes(x = sqft_living, y = price)) +
  geom_point() +
  xlim(0, 9000) +
  ylim(0, 5000000) +
  geom_line(aes(y = predictions), colour = "blue", linetype = "dashed")


            
              # Predict the price of a single hypothetical house with 3000 sqft of living
# area. The input is a 1 x 1 matrix (one row, one feature column).
new_sqft_living <- matrix(3000, nrow = 1, ncol = 1)

# Wrap the matrix in a DMatrix and run it through the trained model.
dnew <- xgb.DMatrix(data = new_sqft_living)
predicted_price <- predict(xgb_model, dnew)

# Report the prediction rounded to two decimals.
rounded_price <- round(predicted_price, digits = 2)
print(paste("Predicted price for 3000 sqft living is:", rounded_price))

[1] "Predicted price for 3000 sqft living is: 791705.94"

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	sqft_above	sqft_basement	yr_built	yr_renovated	street	city	statezip	country
	<chr>	<dbl>	<dbl>	<dbl>	<int>	<int>	<dbl>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<chr>	<chr>	<chr>	<chr>
1	2014-05-02 00:00:00	313000	3	1.50	1340	7912	1.5	0	0	3	1340	0	1955	2005	18810 Densmore Ave N	Shoreline	WA 98133	USA
2	2014-05-02 00:00:00	2384000	5	2.50	3650	9050	2.0	0	4	5	3370	280	1921	0	709 W Blaine St	Seattle	WA 98119	USA
3	2014-05-02 00:00:00	342000	3	2.00	1930	11947	1.0	0	0	4	1930	0	1966	0	26206-26214 143rd Ave SE	Kent	WA 98042	USA
4	2014-05-02 00:00:00	420000	3	2.25	2000	8030	1.0	0	0	4	1000	1000	1963	0	857 170th Pl NE	Bellevue	WA 98008	USA
5	2014-05-02 00:00:00	550000	4	2.50	1940	10500	1.0	0	0	4	1140	800	1976	1992	9105 170th Ave NE	Redmond	WA 98052	USA
6	2014-05-02 00:00:00	490000	2	1.00	880	6380	1.0	0	0	3	880	0	1938	1994	522 NE 88th St	Seattle	WA 98115	USA

	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	sqft_above	sqft_basement	yr_built	yr_renovated	street	city	statezip	country
	<chr>	<dbl>	<dbl>	<dbl>	<int>	<int>	<dbl>	<int>	<int>	<int>	<int>	<int>	<int>	<int>	<chr>	<chr>	<chr>	<chr>
4595	2014-07-09 00:00:00	210614.3	3	2.50	1610	7223	2	0	0	3	1610	0	1994	0	26306 127th Ave SE	Kent	WA 98030	USA
4596	2014-07-09 00:00:00	308166.7	3	1.75	1510	6360	1	0	0	4	1510	0	1954	1979	501 N 143rd St	Seattle	WA 98133	USA
4597	2014-07-09 00:00:00	534333.3	3	2.50	1460	7573	2	0	0	3	1460	0	1983	2009	14855 SE 10th Pl	Bellevue	WA 98007	USA
4598	2014-07-09 00:00:00	416904.2	3	2.50	3010	7014	2	0	0	3	3010	0	2009	0	759 Ilwaco Pl NE	Renton	WA 98059	USA
4599	2014-07-10 00:00:00	203400.0	4	2.00	2090	6630	1	0	0	3	1070	1020	1974	0	5148 S Creston St	Seattle	WA 98178	USA
4600	2014-07-10 00:00:00	220600.0	3	2.50	1490	8102	2	0	0	4	1490	0	1990	0	18717 SE 258th St	Covington	WA 98042	USA

	price	bedrooms	sqft_living	floors	sqft_lot	condition	view	yr_built
	<dbl>	<dbl>	<int>	<dbl>	<int>	<int>	<int>	<int>
1	313000	3	1340	1.5	7912	3	0	1955
2	2384000	5	3650	2.0	9050	5	4	1921
3	342000	3	1930	1.0	11947	4	0	1966
4	420000	3	2000	1.0	8030	4	0	1963
5	550000	4	1940	1.0	10500	4	0	1976
6	490000	2	880	1.0	6380	3	0	1938

	price	bedrooms	sqft_living	floors	sqft_lot	condition	view	oldbuilt
price	1.00000000	0.20033629	0.43041003	0.15146080	0.050451295	0.034914537	0.22850417	-0.02185683
bedrooms	0.20033629	1.00000000	0.59488406	0.17789490	0.068819355	0.025079856	0.11102800	-0.14246104
sqft_living	0.43041003	0.59488406	1.00000000	0.34485027	0.210538454	-0.062825979	0.31100944	-0.28777522
floors	0.15146080	0.17789490	0.34485027	1.00000000	0.003749750	-0.275013395	0.03121095	-0.46748066
sqft_lot	0.05045130	0.06881935	0.21053845	0.00374975	1.000000000	0.000558114	0.07390674	-0.05070635
condition	0.03491454	0.02507986	-0.06282598	-0.27501339	0.000558114	1.000000000	0.06307728	0.39969823
view	0.22850417	0.11102800	0.31100944	0.03121095	0.073906741	0.063077281	1.00000000	0.06446506
oldbuilt	-0.02185683	-0.14246104	-0.28777522	-0.46748066	-0.050706346	0.399698234	0.06446506	1.00000000

Predict House Price - R programming - Ayoub MABROUK

Import libraries

Import dataset

Data exploration

Feature selection

Checking Null values

Figure out house age

Plot Correlation matrix

Plot scatterplot matrix

Plot boxplot for checking outliers

Plot scatterplots

Plot density plot to check normality

Plot univariate XGBoost regression of price on sqft_living