In this exploratory data analysis, data from NYC Department of Finance are used to examine the average sale price of properties sold in New York City in 2019. NYC Department of Finance has been collecting annual sales records since 2003, with detailed information such as building type, square footage, address, price, etc. Additionally, data from SimplyAnalytics & US Census Bureau American Community Survey are used to analyze the 2020 median household income in each NYC borough.
I will focus only on single-family home properties sold in 2019 and answer the following questions:
# Property Sales data
# Create an empty data frame
sale <- data.frame()
# Combine sales data from the five boroughs
locations <- c('statenisland', 'bronx', 'manhattan', 'queens', 'brooklyn')
for (l in locations) {
single_l <- read_excel(paste('./data/nyc-properties/2019_', l ,'.xlsx', sep=''), skip = 4)
sale <- rbind(sale, single_l)
}
# Delete the extra white spaces and punctuation marks in the column names
names(sale) <- make.names(names(sale), unique = TRUE)
names(sale) <- gsub('..', "", names(sale), fixed=TRUE)
# Limit the sales data to single-family home sales only
singlehome <- sale %>%
select(BOROUGH, NEIGHBORHOOD, BUILDING.CLASS.CATEGORY, ADDRESS, ZIP.CODE,
GROSS.SQUARE.FEET, YEAR.BUILT, SALE.PRICE, SALE.DATE) %>%
filter(str_detect(BUILDING.CLASS.CATEGORY, '01\\s*ONE FAMILY.*')) %>%
filter(!is.na(SALE.PRICE)) %>%
transform(SALE.PRICE = as.numeric(SALE.PRICE), ZIP.CODE = as.character(ZIP.CODE)) %>%
mutate(BOROUGH.NAME = case_when(BOROUGH == '1' ~ 'Manhattan',
BOROUGH == '2' ~ 'Bronx',
BOROUGH == '3' ~ 'Brooklyn',
BOROUGH == '4' ~ 'Queens',
BOROUGH == '5' ~ 'Staten Island',))
head(singlehome)
## BOROUGH NEIGHBORHOOD BUILDING.CLASS.CATEGORY ADDRESS ZIP.CODE
## 1 5 ANNADALE 01 ONE FAMILY DWELLINGS 4716 AMBOY ROAD 10312
## 2 5 ANNADALE 01 ONE FAMILY DWELLINGS 21 FINGAL STREET 10312
## 3 5 ANNADALE 01 ONE FAMILY DWELLINGS 525 SYCAMORE STREET 10312
## 4 5 ANNADALE 01 ONE FAMILY DWELLINGS 1468 ARDEN AVENUE 10312
## 5 5 ANNADALE 01 ONE FAMILY DWELLINGS 2 SANDBORN STREET 10312
## 6 5 ANNADALE 01 ONE FAMILY DWELLINGS 247 KOCH BOULEVARD 10312
## GROSS.SQUARE.FEET YEAR.BUILT SALE.PRICE SALE.DATE BOROUGH.NAME
## 1 910 2002 0 2019-07-10 Staten Island
## 2 3540 1985 1650000 2019-03-14 Staten Island
## 3 2848 1980 775000 2019-12-13 Staten Island
## 4 2200 1940 685000 2019-04-15 Staten Island
## 5 880 1950 0 2019-01-24 Staten Island
## 6 1528 1970 0 2019-08-07 Staten Island
# Median Household Income Data
# Load the income data
income <- read_csv('./data/ny-median-household-income.csv')
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Name = col_character(),
## `Geographic Unit` = col_character(),
## `Median Household Income, 2020 [Estimated]` = col_double()
## )
# Format the column names
names(income) <- make.names(names(income), unique = TRUE)
names(income) <- gsub('..', ".", names(income), fixed=TRUE)
# Add zip code column to income data
income <- income %>%
mutate(ZIP.CODE = sapply(strsplit(income$Name, "\\,"), function(x) x[1])) %>%
filter(!is.na('Median.Household.Income.2020.Estimated.')) %>%
filter(ZIP.CODE != 'New York')
# To align the borough names ...
# Select zip code and borough name columns from sales data
sale.borough <- singlehome %>%
group_by(BOROUGH.NAME, ZIP.CODE) %>%
summarize(total = n()) %>%
select(BOROUGH.NAME, ZIP.CODE)
## `summarise()` has grouped output by 'BOROUGH.NAME'. You can override using the `.groups` argument.
# Merge the two data by zip code so the income data will have the same borough names as the sales data
# Also, I manually collected zip codes in Manhattan areas to fix the missing values
manhattan.zip <- c(10026, 10027, 10030, 10037, 10039, 10001, 10011, 10018, 10019, 10020, 10036, 10029, 10035, 10010, 10016, 10017, 10022, 10004, 10005, 10006, 10007, 10038, 10280, 10002, 10003, 10009, 10021, 10028, 10044, 10065, 10075, 10128, 10023, 10024, 10025, 10031, 10032, 10033, 10034, 10040)
bronx.zip <- c()
income <- merge(income, sale.borough, by = 'ZIP.CODE', all.x = TRUE) %>%
mutate(BOROUGH.NAME = case_when(ZIP.CODE %in% sapply(manhattan.zip,
function(x) as.character(x)) ~ 'Manhattan',
ZIP.CODE == '10474' ~ 'Bronx',
ZIP.CODE == '11005' ~ 'Queens',
ZIP.CODE == '11359' ~ 'Queens',
TRUE ~ BOROUGH.NAME))
head(income)
## ZIP.CODE Name Geographic.Unit
## 1 10001 10001, New York, NY Zip Code
## 2 10002 10002, New York, NY Zip Code
## 3 10003 10003, New York, NY Zip Code
## 4 10004 10004, New York, NY Zip Code
## 5 10005 10005, New York, NY Zip Code
## 6 10006 10006, New York, NY Zip Code
## Median.Household.Income.2020.Estimated. BOROUGH.NAME
## 1 88726.77 Manhattan
## 2 36528.70 Manhattan
## 3 111096.97 Manhattan
## 4 154770.50 Manhattan
## 5 172478.01 Manhattan
## 6 170669.84 Manhattan
In this section, I am going to perform some elementary analyses on the single-home sales data that result from the above cleaning.
# Define a function to calculate some basic statistics and examine the normality
cal_statistics <- function(df, col){
value <- as.numeric(df[[col]])
print(paste('The mean is:', as.character(mean(value, na.rm = TRUE))))
print(paste('The median is:', as.character(median(value, na.rm = TRUE))))
print(paste('The max is:', as.character(max(value, na.rm = TRUE))))
print(paste('The min is:', as.character(min(value, na.rm = TRUE))))
print(paste('skewness:', as.character(skewness(value, na.rm = TRUE))))
print(paste('kurtosis:', as.character(kurtosis(value, na.rm = TRUE))))
}
cal_statistics(singlehome, 'SALE.PRICE')
## [1] "The mean is: 484945.650018343"
## [1] "The median is: 440000"
## [1] "The max is: 77100000"
## [1] "The min is: 0"
## [1] "skewness: 27.0006548955661"
## [1] "kurtosis: 1459.54564566821"
ggplot(data = singlehome, mapping = aes(x = SALE.PRICE)) +
geom_histogram(bins = 100, fill = 'orange') +
labs(x = 'Sale Price', y = 'Property Counts') +
ggtitle('Histogram of Sale Price') +
theme_minimal()
ggplot(data = singlehome, mapping = aes(x='', y = SALE.PRICE)) +
geom_boxplot() +
stat_summary(fun=mean,
geom="point",
shape=20,
size=5,
color='gray', fill="gray") +
labs(y = 'Sale Price', x = '') +
ggtitle('Box Plot of Sale Price') +
theme_minimal()+
theme(
legend.position="none",
plot.title = element_text(size=11)
)
Based on the above calculation and charts, although the possible range of sale price is between $0 and $77.1 million, most properties have a sale price below $20 million. Looking at the histogram, we can see that the shape of the distribution is highly skewed, with most data points concentrating in a few bins. I made a boxplot to better understand the distribution of the data and as expected, it’s full of outliers! The interquartile range is so narrow that we can hardly see it.
In the following chunk, I am going to filter further the data set using the z score. Because the data set is not normally distributed, there will be some inaccuracies when calculating its z score and percentiles.
# I set a threshold of $4,000,000 to filter the data set
filter.price <- 4000000
# Calculate the z score and percentiles of the threshold price
singlehome.mean <- mean(singlehome$SALE.PRICE)
singlehome.sd <- sd(singlehome$SALE.PRICE)
singlehome.z <- (filter.price - singlehome.mean) / singlehome.sd
pnorm(singlehome.z)
## [1] 0.9994141
The probability of observing a sale price of $4,000,000 or lower in my data set is 99.94%.
# Filter the data
belowe4M <- singlehome %>%
filter(SALE.PRICE <= filter.price & SALE.PRICE >= 10000)
cal_statistics(belowe4M, 'SALE.PRICE')
## [1] "The mean is: 660768.829799427"
## [1] "The median is: 580000"
## [1] "The max is: 4e+06"
## [1] "The min is: 10000"
## [1] "skewness: 2.85241169050267"
## [1] "kurtosis: 16.8136191802642"
ggplot(data = belowe4M, mapping = aes(x = SALE.PRICE)) +
geom_histogram(bins = 60, fill = 'orange') +
scale_x_continuous(labels=function(x) paste0("$", x/1000000, "M"))+
labs(x = 'Sale Price', y = 'Property Counts') +
ggtitle('Histogram of Sale Price Under $4,000,000') +
theme_minimal()
ggplot(data = belowe4M, mapping = aes(x='', y = SALE.PRICE)) +
geom_boxplot() +
stat_summary(fun=mean,
geom="point",
shape=20,
size=5,
color='gray', fill="gray") +
scale_y_continuous(labels=function(x) paste0("$", x/1000000, "M"))+
labs(y = 'Sale Price', x = '') +
ggtitle('Box Plot of Sale Price') +
theme_minimal()+
theme(
legend.position="none",
plot.title = element_text(size=11)
)
Of all the properties sold at a price above $10K and below $4 million, the mean sale price was $660,769 and the median sale price was $580,000. Furthermore, with the skewness and kurtosis being 2.85 and 16.81, we can conclude that it’s a skewed right distribution where the property prices highly concentrate below $1 million. We can see from the boxplot that there are numerous outliers in our data set, but the box representing the interquartile range is visible this time.
# Filter the data
over4M <- singlehome %>%
filter(SALE.PRICE >= filter.price)
cal_statistics(over4M, 'SALE.PRICE')
## [1] "The mean is: 9751675.92622951"
## [1] "The median is: 7200000"
## [1] "The max is: 77100000"
## [1] "The min is: 4e+06"
## [1] "skewness: 4.91784328262067"
## [1] "kurtosis: 38.517138919842"
ggplot(data = over4M, mapping = aes(x = SALE.PRICE)) +
geom_histogram(bins = 60, fill = 'orange') +
scale_x_continuous(labels=function(x) paste0("$", x/1000000, "M"))+
labs(x = 'Sale Price', y = 'Property Counts') +
ggtitle('Histogram of Sale Price Above $4,000,000') +
theme_minimal()
ggplot(data = over4M, mapping = aes(x='', y = SALE.PRICE)) +
geom_boxplot() +
stat_summary(fun=mean,
geom="point",
shape=20,
size=5,
color='gray', fill="gray") +
scale_y_continuous(labels=function(x) paste0("$", x/1000000, "M"))+
labs(y = 'Sale Price', x = '') +
ggtitle('Box Plot of Sale Price') +
theme_minimal()+
theme(
legend.position="none",
plot.title = element_text(size=11)
)
I made another histogram for properties with a sale price of greater than $4 million, resulting in a skewed right histogram whose mean and median are $9.75 million and $7.2 million. Although the distribution has fewer outliers, it has a much higher skewness, 4.92.
In this section, I am going to compare the average sale price of single-family homes in the five boroughs in NYC.
# Group the data by the five boroughs
# Calculate central tendency and test normality
borough.sale.summary <- belowe4M %>%
group_by(BOROUGH.NAME) %>%
summarize(mean_sale = mean(SALE.PRICE, na.rm=TRUE),
median_sale = median(SALE.PRICE, na.rm=TRUE),
sd_sale = sd(SALE.PRICE, na.rm=TRUE),
kurtosis_sale = kurtosis(SALE.PRICE, na.rm=TRUE),
skewness_sale = skewness(SALE.PRICE, na.rm=TRUE))
borough.sale.summary
## # A tibble: 5 x 6
## BOROUGH.NAME mean_sale median_sale sd_sale kurtosis_sale skewness_sale
## * <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bronx 517091. 480000 311011. 34.7 4.47
## 2 Brooklyn 878519. 725000 572774. 8.54 2.06
## 3 Manhattan 2054627. 1812500 1087196. 1.66 0.126
## 4 Queens 666672. 615000 324274. 10.7 1.84
## 5 Staten Island 549517. 530000 231369. 23.2 2.66
# Box plots
belowe4M %>%
group_by(BOROUGH.NAME) %>%
ggplot(aes(x=BOROUGH.NAME, y=SALE.PRICE)) +
geom_boxplot(fill='#2a9d8f') +
stat_summary(fun=mean,
geom="point",
shape=20,
size=5,
color='orange', fill="orange") +
theme_minimal() +
theme(
legend.position="none",
plot.title = element_text(size=11)
) +
ggtitle("Sale Price in Each Borough") +
labs( x = "Borough", y = 'Sale Price') +
scale_y_continuous(labels=function(x) paste0("$", x/1000000, "M"))
Records with a sale price below subtracting 1.5x interquartile range from the first quartile or above adding 1.5x interquartile range to the third quartile are considered outliers that differed from most observations in the data set.
Manhattan has the highest mean and median sale price, while Bronx has the lowest. In all boroughs, the mean prices are greater than the median prices and the skewness values are above 0. The result suggests that all five distributions are positively skewed while the distribution of Bronx skewed the most. Except for Manhattan, all the other boroughs have a few outliers that situate between $3 million and $4 million, higher than 75% of Manhattan’s observations.
Since Manhattan and Bronx differed in many ways, let’s focus on these two boroughs to gather more information.
Manhattan has the most extensive interquartile range and a kurtosis less than 3, which is the smallest of all. These values indicate that although Manhattan has the highest mean and median sale price, its distribution is the most spread out.
On the other hand, Bronx has the lowest mean and median sale price and the highest kurtosis, suggesting that the shape of its distribution is tall and narrow and most observations concentrate in a small range of values.
Based on the above statistics, we can tell that Manhattan properties were sold at a higher price on average. But to find out whether there is a statistically significant difference between the mean sale price in Manhattan and NYC, I am going to conduct a one-sample t-test.
# Define a function to filter the data for me
extract_stats <- function(df, borough, col){
df <- df %>%
filter(BOROUGH.NAME == borough) %>%
select(col)
return (df[[col]])
}
# Define a function to calculate the t score as well as t probability and conduct the t test
calculate_t <- function(grouped_dataset,
ungrouped_dataset,
borough,
mean_col,
sd_col,
selected_col,
t_test_option){
pop.mean <- mean(ungrouped_dataset[[selected_col]], na.rm = TRUE)
sample.mean <- extract_stats(grouped_dataset, borough, mean_col)
sample.sd <- extract_stats(grouped_dataset, borough, sd_col)
selected.borough.dataset <- filter(ungrouped_dataset, BOROUGH.NAME == borough)
sample.size <- nrow(selected.borough.dataset)
sample.se <- sample.sd / sqrt(sample.size)
t.score <- (sample.mean - pop.mean) / sample.se
t.prob <- pt(t.score, df = sample.size - 1, lower.tail = TRUE)
t.test.result <- t.test(x = selected.borough.dataset[[selected_col]],
mu = pop.mean,
alternative = t_test_option)
print(pop.mean)
print(borough)
print(sample.mean)
print(t.test.result)
}
calculate_t(borough.sale.summary,
belowe4M,
'Manhattan',
'mean_sale',
'sd_sale',
'SALE.PRICE', 'greater')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(col)` instead of `col` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## [1] 660768.8
## [1] "Manhattan"
## [1] 2054627
##
## One Sample t-test
##
## data: selected.borough.dataset[[selected_col]]
## t = 6.7841, df = 27, p-value = 1.379e-07
## alternative hypothesis: true mean is greater than 660768.8
## 95 percent confidence interval:
## 1704668 Inf
## sample estimates:
## mean of x
## 2054627
The p-value is 1.379e-07, much smaller than the threshold of 0.05. The result suggests that we accept the alternative hypothesis and acknowledge that the mean sale price of Manhattan’s properties is significantly greater than that of all NYC properties.
I am going to conduct the one-sample t-test on observations from Bronx properties, which has the lowest mean.
calculate_t(borough.sale.summary,
belowe4M,
'Bronx',
'mean_sale',
'sd_sale',
'SALE.PRICE',
'less')
## [1] 660768.8
## [1] "Bronx"
## [1] 517091.1
##
## One Sample t-test
##
## data: selected.borough.dataset[[selected_col]]
## t = -15.062, df = 1062, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 660768.8
## 95 percent confidence interval:
## -Inf 532795.2
## sample estimates:
## mean of x
## 517091.1
The p-value is 2.2e-16, much smaller than the threshold of 0.05. The result suggests that we accept the alternative hypothesis and acknowledge that the mean sale price of Bronx properties is significantly less than that of all NYC properties.
In this section, I am going to perform some elementary analyses on the medians of NYC household income by zip code that result from the data cleaning process in the first section.
cal_statistics(income, 'Median.Household.Income.2020.Estimated.')
## [1] "The mean is: 72046.9870347084"
## [1] "The median is: 65352.9277541825"
## [1] "The max is: 239390.2"
## [1] "The min is: 21959.056815471"
## [1] "skewness: 1.38541556598362"
## [1] "kurtosis: 6.33464198442706"
ggplot(data = income, mapping = aes(x = `Median.Household.Income.2020.Estimated.`)) +
geom_histogram(bins = 30, fill = 'orange') +
labs(x = 'Median Household Income', y = 'Counts') +
scale_x_continuous(labels=function(x) paste0("$", x/1000, "K"))+
ggtitle('Histogram of Median Household Income') +
theme_minimal()
## Warning: Removed 2 rows containing non-finite values (stat_bin).
The mean of the median household income in NYC is $72,047, greater than the median. The skewness and kurtosis are 1.4 and 6.3 separately. These numbers indicate that the distribution of the medians of NYC household income by zip code is positively skewed and leptokurtic, where most values center around the mean.
In the histogram above, there are a few bars located between $150K and $250K. We can further inspect whether these are the outliers in our data set.
# Group income data by the five boroughs
# Calculate central tendency and test normality
borough.income.summary <- income %>%
group_by(BOROUGH.NAME) %>%
summarize(mean_income = mean(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE),
median_income = median(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE),
sd_income = sd(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE),
kurtosis_income = kurtosis(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE),
skewness_income = skewness(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE))
borough.income.summary
## # A tibble: 5 x 6
## BOROUGH.NAME mean_income median_income sd_income kurtosis_income
## * <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Bronx 43779. 36270. 20232. 3.03
## 2 Brooklyn 60916. 55691. 24331. 3.91
## 3 Manhattan 99293. 108221. 46859. 3.14
## 4 Queens 70195. 70780. 14564. 2.98
## 5 Staten Isla… 76733. 77629. 16361. 2.04
## # … with 1 more variable: skewness_income <dbl>
# Plot
income %>%
group_by(BOROUGH.NAME) %>%
ggplot(aes(x=BOROUGH.NAME, y=`Median.Household.Income.2020.Estimated.`)) +
geom_boxplot(fill='#2a9d8f') +
stat_summary(fun=mean,
geom="point",
shape=20,
size=5,
color='orange', fill="orange") +
theme_minimal() +
theme(
legend.position="none",
plot.title = element_text(size=11)
) +
ggtitle("Median Household Income in Each Borough") +
labs( x = "Borough", y = 'Median Houhold Income') +
scale_y_continuous(labels=function(x) paste0("$", x/1000, "K"))
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_summary).
In terms of the median household income (referred to as income in the following paragraphs), Manhattan also has the highest mean and median values while Bronx has the lowest.
In Bronx and Brooklyn, the mean incomes are larger than the median incomes. Their skewness values are both greater than 1. These numbers suggest that the distributions of Bronx and Brooklyn’s observations are positively skewed and most observations situate in the lower end of the range of the distribution.
Mean incomes are slightly lower than median incomes in the other three boroughs; however, their skewness values are still positive. We can further inspect the data by plotting histograms.
income %>%
filter(BOROUGH.NAME == 'Manhattan') %>%
ggplot(aes(x=`Median.Household.Income.2020.Estimated.`)) +
geom_histogram(bins = 20, fill = 'orange') +
labs(x = 'Manhattan Median Household Income', y = 'Counts') +
scale_x_continuous(labels=function(x) paste0("$", x/1000, "K"))+
ggtitle('Histogram of Manhattan Median Household Income') +
theme_minimal()
## Warning: Removed 1 rows containing non-finite values (stat_bin).
income %>%
filter(BOROUGH.NAME == 'Queens') %>%
ggplot(aes(x=`Median.Household.Income.2020.Estimated.`)) +
geom_histogram(bins = 20, fill = 'orange') +
labs(x = 'Queens Median Household Income', y = 'Counts') +
scale_x_continuous(labels=function(x) paste0("$", x/1000, "K"))+
ggtitle('Histogram of Queens Median Household Income') +
theme_minimal()
## Warning: Removed 1 rows containing non-finite values (stat_bin).
income %>%
filter(BOROUGH.NAME == 'Staten Island') %>%
ggplot(aes(x=`Median.Household.Income.2020.Estimated.`)) +
geom_histogram(bins = 8, fill = 'orange') +
labs(x = 'Staten Island Median Household Income', y = 'Counts') +
scale_x_continuous(labels=function(x) paste0("$", x/1000, "K"))+
ggtitle('Histogram of Staten Island Median Household Income') +
theme_minimal()
With some further investigation, we can see that these distributions are not unimodal but bimodal. Take Manhattan’s distribution for example, there is the main mode value that situates at around $130K and a second lower mode value at around $48K. Although most of the values concentrate towards the higher end of the distribution, the second mode pulls the mean below the median.
Compared with all boroughs in NYC, Manhattan’s median household income is the highest value while that of Bronx is the lowest. But to find out whether there is a statistically significant difference, I am going to conduct a one-sample t-test.
calculate_t(borough.income.summary,
income,
'Manhattan',
'mean_income',
'sd_income',
'Median.Household.Income.2020.Estimated.',
'greater')
## [1] 72046.99
## [1] "Manhattan"
## [1] 99293.21
##
## One Sample t-test
##
## data: selected.borough.dataset[[selected_col]]
## t = 3.8129, df = 42, p-value = 0.0002217
## alternative hypothesis: true mean is greater than 72046.99
## 95 percent confidence interval:
## 87274.17 Inf
## sample estimates:
## mean of x
## 99293.21
The p-value is 0.0002217, smaller than the threshold of 0.05. The result suggests that we accept the alternative hypothesis and acknowledge that Manhattan’s mean median household income is significantly greater than that of NYC.
calculate_t(borough.income.summary, income, 'Bronx', 'mean_income', 'sd_income', 'Median.Household.Income.2020.Estimated.', 'less')
## [1] 72046.99
## [1] "Bronx"
## [1] 43779.24
##
## One Sample t-test
##
## data: selected.borough.dataset[[selected_col]]
## t = -6.9858, df = 24, p-value = 1.593e-07
## alternative hypothesis: true mean is less than 72046.99
## 95 percent confidence interval:
## -Inf 50702.29
## sample estimates:
## mean of x
## 43779.24
The p-value is 1.593e-07, much smaller than the threshold of 0.05. The result suggests that we accept the alternative hypothesis and acknowledge that Bronx’s mean median household income is significantly less than that of NYC.
I am merging the two data sets and inspecting whether there’s a significant correlation between the sale price and median household income.
# Group the sale data by zip code
zip.grouped.sale <- belowe4M %>%
group_by(ZIP.CODE) %>%
summarize(mean_sale = mean(SALE.PRICE), median_sale = median(SALE.PRICE))
# Merge it with the income data
zip.merge <- merge(income, zip.grouped.sale, by = 'ZIP.CODE', all = TRUE)
# Check if two variables are linearly related
# Notes: 'lm' equals linear model
ggplot(data = zip.merge, mapping = aes(x = mean_sale,
y = `Median.Household.Income.2020.Estimated.`)) +
geom_point() +
geom_smooth(method = 'lm') +
labs(x = 'Mean Sale Price', y = 'Median Income') +
ggtitle('Mean Sale Price and Median Income by ZIP') +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 34 rows containing non-finite values (stat_smooth).
## Warning: Removed 34 rows containing missing values (geom_point).
# Test normality
shapiro.test(zip.merge$`Median.Household.Income.2020.Estimated.`)
##
## Shapiro-Wilk normality test
##
## data: zip.merge$Median.Household.Income.2020.Estimated.
## W = 0.91036, p-value = 6.964e-09
shapiro.test(zip.merge$mean_sale)
##
## Shapiro-Wilk normality test
##
## data: zip.merge$mean_sale
## W = 0.73188, p-value = 3.111e-15
# if the variables are continuous, linearly related and normally distributed, we can move on
cor.test(x = zip.merge$`Median.Household.Income.2020.Estimated.`, y = zip.merge$mean_sale)
##
## Pearson's product-moment correlation
##
## data: zip.merge$Median.Household.Income.2020.Estimated. and zip.merge$mean_sale
## t = 6.2445, df = 145, p-value = 4.446e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3225020 0.5790934
## sample estimates:
## cor
## 0.4603595
The scatter plot suggests some sort of positive correlation between these two variables. And judging from the p-value that is much less than the assumed alpha level of 0.05, the correlation is statistically significant.
The results from the Shapiro-Wilk normality tests suggested that the two variables are not normally distributed and thus cast doubts on the reliability of the positive correlation. But the box plots and t-tests show that Manhattan and the Bronx represent two different sides of the NYC distribution in both income and sale price, providing additional evidence of a positive correlation.
To better understand how expensive it is to buy a single-family house in NYC, we can calculate the price-to-income ratio by dividing the median sale price by the median household income in an area. According to City Lab, people can afford a home equal to roughly 2.6 years of their household income, meaning that 2.6 years of household income would be enough to purchase a house.
zip.merge <- zip.merge %>%
mutate(priceIncomeRatio = zip.merge$median_sale / zip.merge$Median.Household.Income.2020.Estimated.)
cal_statistics(zip.merge, 'priceIncomeRatio')
## [1] "The mean is: 15.3226201659883"
## [1] "The median is: 13.0369681646047"
## [1] "The max is: 52.0548249764616"
## [1] "The min is: 4.01838077577956"
## [1] "skewness: 1.67589753710902"
## [1] "kurtosis: 6.27140802547186"
ggplot(data = zip.merge, mapping=aes(x=priceIncomeRatio)) +
geom_histogram(bins = 30, fill = 'orange') +
labs(x = 'price-to-income ratio', y = 'counts') +
ggtitle('NYC price-to-income ratio by zip') +
theme_minimal()
## Warning: Removed 34 rows containing non-finite values (stat_bin).
The mean of NYC’s price-to-income ratio is 15.32, indicating that over 15 years of household income was necessary to buy a single-family home, which was a lot higher than the healthy 2.6 price-to-income ratio suggested by the City Lab.
head(zip.merge[order(zip.merge$priceIncomeRatio),])
## ZIP.CODE Name Geographic.Unit
## 181 11697 11697, Breezy Point, NY Zip Code
## 70 10464 10464, Bronx, NY Zip Code
## 155 11411 11411, Cambria Heights, NY Zip Code
## 157 11413 11413, Springfield Gardens, NY Zip Code
## 166 11422 11422, Rosedale, NY Zip Code
## 51 10308 10308, Staten Island, NY Zip Code
## Median.Household.Income.2020.Estimated. BOROUGH.NAME mean_sale median_sale
## 181 107879.27 Queens 477219.6 433500
## 70 93400.07 Bronx 568069.5 446160
## 155 92719.36 Queens 463411.1 485000
## 157 85894.76 Queens 478181.5 481000
## 166 89171.11 Queens 493265.6 499495
## 51 94284.62 Staten Island 537191.9 531500
## priceIncomeRatio
## 181 4.018381
## 70 4.776870
## 155 5.230839
## 157 5.599876
## 166 5.601534
## 51 5.637187
The zip codes with the least price-income ratios were 11697 (Queens), 10464 (Bronx), 11411 (Queens), 11413 (Queens) and 11422 (Queens). The zip codes with the highest price-income ratios were 10009 (Manhattan), 10030 (Manhattan), 10027 (Manhattan), 10031 (Manhattan), 11205 (Brooklyn).
zip <- read.csv('./data/uszips.csv')
head(zip)
## zip lat lng city state_id state_name zcta parent_zcta
## 1 601 18.18005 -66.75218 Adjuntas PR Puerto Rico TRUE NA
## 2 602 18.36074 -67.17519 Aguada PR Puerto Rico TRUE NA
## 3 603 18.45440 -67.12201 Aguadilla PR Puerto Rico TRUE NA
## 4 606 18.16721 -66.93828 Maricao PR Puerto Rico TRUE NA
## 5 610 18.29032 -67.12244 Anasco PR Puerto Rico TRUE NA
## 6 612 18.40699 -66.70805 Arecibo PR Puerto Rico TRUE NA
## population density county_fips county_name
## 1 17113 102.7 72001 Adjuntas
## 2 37751 476.0 72003 Aguada
## 3 47081 574.9 72005 Aguadilla
## 4 6392 58.3 72093 Maricao
## 5 26686 286.9 72011 Añasco
## 6 59369 339.1 72013 Arecibo
## county_weights
## 1 {"72001": "99.43", "72141": "0.57"}
## 2 {"72003": "100"}
## 3 {"72005": "100"}
## 4 {"72093": "94.88", "72153": "3.78", "72121": "1.35"}
## 5 {"72011": "99.45", "72003": "0.55"}
## 6 {"72013": "99.89", "72017": "0.11"}
## county_names_all county_fips_all imprecise military
## 1 Adjuntas|Utuado 72001|72141 FALSE FALSE
## 2 Aguada 72003 FALSE FALSE
## 3 Aguadilla 72005 FALSE FALSE
## 4 Maricao|Yauco|Sabana Grande 72093|72153|72121 FALSE FALSE
## 5 Añasco|Aguada 72011|72003 FALSE FALSE
## 6 Arecibo|Barceloneta 72013|72017 FALSE FALSE
## timezone
## 1 America/Puerto_Rico
## 2 America/Puerto_Rico
## 3 America/Puerto_Rico
## 4 America/Puerto_Rico
## 5 America/Puerto_Rico
## 6 America/Puerto_Rico
final <- merge(zip.merge, zip, by.x = 'ZIP.CODE', by.y = 'zip', all.x = TRUE)
final.withoutNA <- final %>% filter(!is.na(lng), !is.na(lat))
# Setting existing coordinate as lat-long system and as in the right format
cord.dec <- SpatialPoints(cbind(final.withoutNA$lng, final.withoutNA$lat), proj4string = CRS("+proj=longlat"))
# Transforming coordinate to UTM using WGS=84, UTM Zone=18,
cord.utm <- spTransform(cord.dec, CRS("+proj=utm +zone=18T ellps=WGS84"))
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in Proj4
## definition
final.withoutNA$coordsx = as(cord.utm, "data.frame")$coords.x1
final.withoutNA$coordsy = as(cord.utm, "data.frame")$coords.x2
write.csv(final.withoutNA,"./data/zipWithLatLon_property_sales.csv", row.names = FALSE)
The above analyses leave several remaining questions open to investigation:
Further analysis will be required to answer these questions.