Introducing the data

In this exploratory data analysis, data from NYC Department of Finance are used to examine the average sale price of properties sold in New York City in 2019. NYC Department of Finance has been collecting annual sales records since 2003, with detailed information such as building type, square footage, address, price, etc. Additionally, data from SimplyAnalytics & US Census Bureau American Community Survey are used to analyze the 2020 median household income in each NYC borough.

I will focus only on single-family home properties sold in 2019 and answer the following questions:

# Property Sales data
# Create an empty data frame
sale <- data.frame()

# Combine sales data from the five boroughs
locations <- c('statenisland', 'bronx', 'manhattan', 'queens', 'brooklyn')
for (l in locations) {
  single_l <- read_excel(paste('./data/nyc-properties/2019_', l ,'.xlsx', sep=''), skip = 4)
  sale <- rbind(sale, single_l)
}

# Delete the extra white spaces and punctuation marks in the column names
names(sale) <- make.names(names(sale), unique = TRUE)  
names(sale) <- gsub('..', "", names(sale), fixed=TRUE)

# Limit the sales data to single-family home sales only
singlehome <- sale %>% 
  select(BOROUGH, NEIGHBORHOOD, BUILDING.CLASS.CATEGORY, ADDRESS, ZIP.CODE, 
         GROSS.SQUARE.FEET, YEAR.BUILT, SALE.PRICE, SALE.DATE) %>%
  filter(str_detect(BUILDING.CLASS.CATEGORY, '01\\s*ONE FAMILY.*')) %>% 
  filter(!is.na(SALE.PRICE)) %>%
  transform(SALE.PRICE = as.numeric(SALE.PRICE), ZIP.CODE = as.character(ZIP.CODE)) %>%
  mutate(BOROUGH.NAME = case_when(BOROUGH == '1' ~ 'Manhattan',
                                  BOROUGH == '2' ~ 'Bronx',
                                  BOROUGH == '3' ~ 'Brooklyn',
                                  BOROUGH == '4' ~ 'Queens',
                                  BOROUGH == '5' ~ 'Staten Island',))
head(singlehome)
##   BOROUGH NEIGHBORHOOD BUILDING.CLASS.CATEGORY             ADDRESS ZIP.CODE
## 1       5     ANNADALE 01 ONE FAMILY DWELLINGS     4716 AMBOY ROAD    10312
## 2       5     ANNADALE 01 ONE FAMILY DWELLINGS    21 FINGAL STREET    10312
## 3       5     ANNADALE 01 ONE FAMILY DWELLINGS 525 SYCAMORE STREET    10312
## 4       5     ANNADALE 01 ONE FAMILY DWELLINGS   1468 ARDEN AVENUE    10312
## 5       5     ANNADALE 01 ONE FAMILY DWELLINGS   2 SANDBORN STREET    10312
## 6       5     ANNADALE 01 ONE FAMILY DWELLINGS  247 KOCH BOULEVARD    10312
##   GROSS.SQUARE.FEET YEAR.BUILT SALE.PRICE  SALE.DATE  BOROUGH.NAME
## 1               910       2002          0 2019-07-10 Staten Island
## 2              3540       1985    1650000 2019-03-14 Staten Island
## 3              2848       1980     775000 2019-12-13 Staten Island
## 4              2200       1940     685000 2019-04-15 Staten Island
## 5               880       1950          0 2019-01-24 Staten Island
## 6              1528       1970          0 2019-08-07 Staten Island
# Median Household Income Data
# Load the income data
income <- read_csv('./data/ny-median-household-income.csv')
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Name = col_character(),
##   `Geographic Unit` = col_character(),
##   `Median Household Income, 2020 [Estimated]` = col_double()
## )
# Format the column names
names(income) <- make.names(names(income), unique = TRUE)  
names(income) <- gsub('..', ".", names(income), fixed=TRUE)

# Add zip code column to income data
income <- income %>% 
  mutate(ZIP.CODE = sapply(strsplit(income$Name, "\\,"), function(x) x[1]))  %>%
  filter(!is.na('Median.Household.Income.2020.Estimated.')) %>% 
  filter(ZIP.CODE != 'New York')

# To align the borough names ...
# Select zip code and borough name columns from sales data
sale.borough <- singlehome %>% 
  group_by(BOROUGH.NAME, ZIP.CODE) %>% 
  summarize(total = n()) %>%
  select(BOROUGH.NAME, ZIP.CODE)
## `summarise()` has grouped output by 'BOROUGH.NAME'. You can override using the `.groups` argument.
# Merge the two data by zip code so the income data will have the same borough names as the sales data
# Also, I manually collected zip codes in Manhattan areas to fix the missing values
manhattan.zip <- c(10026, 10027, 10030, 10037, 10039, 10001, 10011, 10018, 10019, 10020, 10036, 10029, 10035, 10010, 10016, 10017, 10022, 10004, 10005, 10006, 10007, 10038, 10280, 10002, 10003, 10009, 10021, 10028, 10044, 10065, 10075, 10128, 10023, 10024, 10025, 10031, 10032, 10033, 10034, 10040)
bronx.zip <- c()
income <- merge(income, sale.borough, by = 'ZIP.CODE', all.x = TRUE) %>% 
  mutate(BOROUGH.NAME = case_when(ZIP.CODE %in% sapply(manhattan.zip, 
                                                       function(x) as.character(x)) ~ 'Manhattan',
                                  ZIP.CODE == '10474' ~ 'Bronx',
                                  ZIP.CODE == '11005' ~ 'Queens',
                                  ZIP.CODE == '11359' ~ 'Queens',
                                  TRUE ~ BOROUGH.NAME))

head(income)
##   ZIP.CODE                Name Geographic.Unit
## 1    10001 10001, New York, NY        Zip Code
## 2    10002 10002, New York, NY        Zip Code
## 3    10003 10003, New York, NY        Zip Code
## 4    10004 10004, New York, NY        Zip Code
## 5    10005 10005, New York, NY        Zip Code
## 6    10006 10006, New York, NY        Zip Code
##   Median.Household.Income.2020.Estimated. BOROUGH.NAME
## 1                                88726.77    Manhattan
## 2                                36528.70    Manhattan
## 3                               111096.97    Manhattan
## 4                               154770.50    Manhattan
## 5                               172478.01    Manhattan
## 6                               170669.84    Manhattan

What is the average single-family home sale price in NYC?

In this section, I am going to perform some elementary analyses on the single-home sales data that result from the above cleaning.

# Define a function to calculate some basic statistics and examine the normality
cal_statistics <- function(df, col){
  value <- as.numeric(df[[col]])
  print(paste('The mean is:', as.character(mean(value, na.rm = TRUE))))
  print(paste('The median is:', as.character(median(value, na.rm = TRUE))))
  print(paste('The max is:', as.character(max(value, na.rm = TRUE))))
  print(paste('The min is:', as.character(min(value, na.rm = TRUE))))
  print(paste('skewness:', as.character(skewness(value, na.rm = TRUE))))
  print(paste('kurtosis:', as.character(kurtosis(value, na.rm = TRUE))))
}

cal_statistics(singlehome, 'SALE.PRICE')
## [1] "The mean is: 484945.650018343"
## [1] "The median is: 440000"
## [1] "The max is: 77100000"
## [1] "The min is: 0"
## [1] "skewness: 27.0006548955661"
## [1] "kurtosis: 1459.54564566821"
ggplot(data = singlehome, mapping = aes(x = SALE.PRICE)) +
  geom_histogram(bins = 100, fill = 'orange') +
  labs(x = 'Sale Price', y = 'Property Counts') +
  ggtitle('Histogram of Sale Price') +
  theme_minimal()

ggplot(data = singlehome, mapping = aes(x='', y = SALE.PRICE)) +
  geom_boxplot() +
  stat_summary(fun=mean, 
                 geom="point", 
                 shape=20, 
                 size=5, 
                 color='gray', fill="gray") +
  labs(y = 'Sale Price', x = '') +
  ggtitle('Box Plot of Sale Price') +
  theme_minimal()+
  theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) 

Based on the above calculation and charts, although the possible range of sale price is between $0 and $77.1 million, most properties have a sale price below $20 million. Looking at the histogram, we can see that the shape of the distribution is highly skewed, with most data points concentrating in a few bins. I made a boxplot to better understand the distribution of the data and as expected, it’s full of outliers! The interquartile range is so narrow that we can hardly see it.

In the following chunk, I am going to filter further the data set using the z score. Because the data set is not normally distributed, there will be some inaccuracies when calculating its z score and percentiles.

# I set a threshold of $4,000,000 to filter the data set
filter.price <- 4000000

# Calculate the z score and percentiles of the threshold price
singlehome.mean <- mean(singlehome$SALE.PRICE)
singlehome.sd <- sd(singlehome$SALE.PRICE)
singlehome.z <- (filter.price - singlehome.mean) / singlehome.sd
pnorm(singlehome.z)
## [1] 0.9994141

The probability of observing a sale price of $4,000,000 or lower in my data set is 99.94%.

# Filter the data
belowe4M <- singlehome %>% 
  filter(SALE.PRICE <= filter.price & SALE.PRICE >= 10000)

cal_statistics(belowe4M, 'SALE.PRICE')
## [1] "The mean is: 660768.829799427"
## [1] "The median is: 580000"
## [1] "The max is: 4e+06"
## [1] "The min is: 10000"
## [1] "skewness: 2.85241169050267"
## [1] "kurtosis: 16.8136191802642"
ggplot(data = belowe4M, mapping = aes(x = SALE.PRICE)) +
  geom_histogram(bins = 60, fill = 'orange') +
  scale_x_continuous(labels=function(x) paste0("$", x/1000000, "M"))+
  labs(x = 'Sale Price', y = 'Property Counts') +
   ggtitle('Histogram of Sale Price Under $4,000,000') +
  theme_minimal()

ggplot(data = belowe4M, mapping = aes(x='', y = SALE.PRICE)) +
  geom_boxplot() +
  stat_summary(fun=mean, 
                 geom="point", 
                 shape=20, 
                 size=5, 
                 color='gray', fill="gray") +
  scale_y_continuous(labels=function(x) paste0("$", x/1000000, "M"))+
  labs(y = 'Sale Price', x = '') +
  ggtitle('Box Plot of Sale Price') +
  theme_minimal()+
  theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) 

Of all the properties sold at a price above $10K and below $4 million, the mean sale price was $660,769 and the median sale price was $580,000. Furthermore, with the skewness and kurtosis being 2.85 and 16.81, we can conclude that it’s a skewed right distribution where the property prices highly concentrate below $1 million. We can see from the boxplot that there are numerous outliers in our data set, but the box representing the interquartile range is visible this time.

# Filter the data
over4M <- singlehome %>% 
  filter(SALE.PRICE >= filter.price)

cal_statistics(over4M, 'SALE.PRICE')
## [1] "The mean is: 9751675.92622951"
## [1] "The median is: 7200000"
## [1] "The max is: 77100000"
## [1] "The min is: 4e+06"
## [1] "skewness: 4.91784328262067"
## [1] "kurtosis: 38.517138919842"
ggplot(data = over4M, mapping = aes(x = SALE.PRICE)) +
  geom_histogram(bins = 60, fill = 'orange') +
  scale_x_continuous(labels=function(x) paste0("$", x/1000000, "M"))+
  labs(x = 'Sale Price', y = 'Property Counts') +
   ggtitle('Histogram of Sale Price Above $4,000,000') +
  theme_minimal()

ggplot(data = over4M, mapping = aes(x='', y = SALE.PRICE)) +
  geom_boxplot() +
  stat_summary(fun=mean, 
                 geom="point", 
                 shape=20, 
                 size=5, 
                 color='gray', fill="gray") +
  scale_y_continuous(labels=function(x) paste0("$", x/1000000, "M"))+
  labs(y = 'Sale Price', x = '') +
  ggtitle('Box Plot of Sale Price') +
  theme_minimal()+
  theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) 

I made another histogram for properties with a sale price of greater than $4 million, resulting in a skewed right histogram whose mean and median are $9.75 million and $7.2 million. Although the distribution has fewer outliers, it has a much higher skewness, 4.92.

What is the average single-family home sale price in each borough?

In this section, I am going to compare the average sale price of single-family homes in the five boroughs in NYC.

# Group the data by the five boroughs 
# Calculate central tendency and test normality
borough.sale.summary <- belowe4M %>% 
  group_by(BOROUGH.NAME) %>%
  summarize(mean_sale = mean(SALE.PRICE, na.rm=TRUE), 
            median_sale = median(SALE.PRICE, na.rm=TRUE), 
            sd_sale = sd(SALE.PRICE, na.rm=TRUE), 
            kurtosis_sale = kurtosis(SALE.PRICE, na.rm=TRUE), 
            skewness_sale = skewness(SALE.PRICE, na.rm=TRUE))

borough.sale.summary 
## # A tibble: 5 x 6
##   BOROUGH.NAME  mean_sale median_sale  sd_sale kurtosis_sale skewness_sale
## * <chr>             <dbl>       <dbl>    <dbl>         <dbl>         <dbl>
## 1 Bronx           517091.      480000  311011.         34.7          4.47 
## 2 Brooklyn        878519.      725000  572774.          8.54         2.06 
## 3 Manhattan      2054627.     1812500 1087196.          1.66         0.126
## 4 Queens          666672.      615000  324274.         10.7          1.84 
## 5 Staten Island   549517.      530000  231369.         23.2          2.66
# Box plots 
belowe4M %>% 
  group_by(BOROUGH.NAME) %>%
  ggplot(aes(x=BOROUGH.NAME, y=SALE.PRICE)) +
    geom_boxplot(fill='#2a9d8f') +
    stat_summary(fun=mean, 
                 geom="point", 
                 shape=20, 
                 size=5, 
                 color='orange', fill="orange") +
    theme_minimal() +
    theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) +
    ggtitle("Sale Price in Each Borough") +
    labs( x = "Borough", y = 'Sale Price') +
    scale_y_continuous(labels=function(x) paste0("$", x/1000000, "M"))

Records with a sale price below subtracting 1.5x interquartile range from the first quartile or above adding 1.5x interquartile range to the third quartile are considered outliers that differed from most observations in the data set.

Manhattan has the highest mean and median sale price, while Bronx has the lowest. In all boroughs, the mean prices are greater than the median prices and the skewness values are above 0. The result suggests that all five distributions are positively skewed while the distribution of Bronx skewed the most. Except for Manhattan, all the other boroughs have a few outliers that situate between $3 million and $4 million, higher than 75% of Manhattan’s observations.

Since Manhattan and Bronx differed in many ways, let’s focus on these two boroughs to gather more information.

Manhattan has the most extensive interquartile range and a kurtosis less than 3, which is the smallest of all. These values indicate that although Manhattan has the highest mean and median sale price, its distribution is the most spread out.

On the other hand, Bronx has the lowest mean and median sale price and the highest kurtosis, suggesting that the shape of its distribution is tall and narrow and most observations concentrate in a small range of values.

Is the average sale price of Manhattan’s properties significantly higher than that of NYC as a whole? Is the average sale price of Bronx’s properties significantly lower than that of NYC?

Based on the above statistics, we can tell that Manhattan properties were sold at a higher price on average. But to find out whether there is a statistically significant difference between the mean sale price in Manhattan and NYC, I am going to conduct a one-sample t-test.

# Define a function to filter the data for me
extract_stats <- function(df, borough, col){
  df <- df %>% 
    filter(BOROUGH.NAME == borough) %>%
    select(col)
  return (df[[col]])
}

# Define a function to calculate the t score as well as t probability and conduct the t test
calculate_t <- function(grouped_dataset, 
                        ungrouped_dataset,
                        borough,
                        mean_col,
                        sd_col,
                        selected_col,
                        t_test_option){
  pop.mean <- mean(ungrouped_dataset[[selected_col]], na.rm = TRUE)
  sample.mean <- extract_stats(grouped_dataset, borough, mean_col)
  sample.sd <- extract_stats(grouped_dataset, borough, sd_col)
  selected.borough.dataset <- filter(ungrouped_dataset, BOROUGH.NAME == borough)
  sample.size <- nrow(selected.borough.dataset)
  sample.se <- sample.sd / sqrt(sample.size)
  t.score <- (sample.mean - pop.mean) / sample.se
  t.prob <- pt(t.score, df = sample.size - 1, lower.tail = TRUE)
  t.test.result <- t.test(x = selected.borough.dataset[[selected_col]], 
                          mu = pop.mean, 
                          alternative = t_test_option)
  print(pop.mean)
  print(borough)
  print(sample.mean)
  print(t.test.result)
}

calculate_t(borough.sale.summary, 
            belowe4M, 
            'Manhattan',  
            'mean_sale',  
            'sd_sale', 
            'SALE.PRICE', 'greater')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(col)` instead of `col` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## [1] 660768.8
## [1] "Manhattan"
## [1] 2054627
## 
##  One Sample t-test
## 
## data:  selected.borough.dataset[[selected_col]]
## t = 6.7841, df = 27, p-value = 1.379e-07
## alternative hypothesis: true mean is greater than 660768.8
## 95 percent confidence interval:
##  1704668     Inf
## sample estimates:
## mean of x 
##   2054627

The p-value is 1.379e-07, much smaller than the threshold of 0.05. The result suggests that we accept the alternative hypothesis and acknowledge that the mean sale price of Manhattan’s properties is significantly greater than that of all NYC properties.

I am going to conduct the one-sample t-test on observations from Bronx properties, which has the lowest mean.

calculate_t(borough.sale.summary, 
            belowe4M, 
            'Bronx',  
            'mean_sale',  
            'sd_sale', 
            'SALE.PRICE', 
            'less')
## [1] 660768.8
## [1] "Bronx"
## [1] 517091.1
## 
##  One Sample t-test
## 
## data:  selected.borough.dataset[[selected_col]]
## t = -15.062, df = 1062, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 660768.8
## 95 percent confidence interval:
##      -Inf 532795.2
## sample estimates:
## mean of x 
##  517091.1

The p-value is 2.2e-16, much smaller than the threshold of 0.05. The result suggests that we accept the alternative hypothesis and acknowledge that the mean sale price of Bronx properties is significantly less than that of all NYC properties.

What is the mean/median of median household income in NYC?

In this section, I am going to perform some elementary analyses on the medians of NYC household income by zip code that result from the data cleaning process in the first section.

cal_statistics(income, 'Median.Household.Income.2020.Estimated.')
## [1] "The mean is: 72046.9870347084"
## [1] "The median is: 65352.9277541825"
## [1] "The max is: 239390.2"
## [1] "The min is: 21959.056815471"
## [1] "skewness: 1.38541556598362"
## [1] "kurtosis: 6.33464198442706"
ggplot(data = income, mapping = aes(x = `Median.Household.Income.2020.Estimated.`)) +
  geom_histogram(bins = 30, fill = 'orange') +
  labs(x = 'Median Household Income', y = 'Counts') + 
    scale_x_continuous(labels=function(x) paste0("$", x/1000, "K"))+
   ggtitle('Histogram of Median Household Income') +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite values (stat_bin).

The mean of the median household income in NYC is $72,047, greater than the median. The skewness and kurtosis are 1.4 and 6.3 separately. These numbers indicate that the distribution of the medians of NYC household income by zip code is positively skewed and leptokurtic, where most values center around the mean.

In the histogram above, there are a few bars located between $150K and $250K. We can further inspect whether these are the outliers in our data set.

What is the mean of median household income in each borough?

# Group income data by the five boroughs
# Calculate central tendency and test normality
borough.income.summary <- income %>% 
  group_by(BOROUGH.NAME) %>%
  summarize(mean_income = mean(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE), 
            median_income = median(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE), 
            sd_income = sd(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE), 
            kurtosis_income = kurtosis(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE), 
            skewness_income = skewness(`Median.Household.Income.2020.Estimated.`, na.rm=TRUE))

borough.income.summary
## # A tibble: 5 x 6
##   BOROUGH.NAME mean_income median_income sd_income kurtosis_income
## * <chr>              <dbl>         <dbl>     <dbl>           <dbl>
## 1 Bronx             43779.        36270.    20232.            3.03
## 2 Brooklyn          60916.        55691.    24331.            3.91
## 3 Manhattan         99293.       108221.    46859.            3.14
## 4 Queens            70195.        70780.    14564.            2.98
## 5 Staten Isla…      76733.        77629.    16361.            2.04
## # … with 1 more variable: skewness_income <dbl>
# Plot
income %>% 
  group_by(BOROUGH.NAME) %>%
  ggplot(aes(x=BOROUGH.NAME, y=`Median.Household.Income.2020.Estimated.`)) +
    geom_boxplot(fill='#2a9d8f') +
    stat_summary(fun=mean, 
                 geom="point", 
                 shape=20, 
                 size=5, 
                 color='orange', fill="orange") +
    theme_minimal() +
    theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) +
    ggtitle("Median Household Income in Each Borough") +
    labs( x = "Borough", y = 'Median Houhold Income') +
    scale_y_continuous(labels=function(x) paste0("$", x/1000, "K"))
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_summary).

In terms of the median household income (referred to as income in the following paragraphs), Manhattan also has the highest mean and median values while Bronx has the lowest.

In Bronx and Brooklyn, the mean incomes are larger than the median incomes. Their skewness values are both greater than 1. These numbers suggest that the distributions of Bronx and Brooklyn’s observations are positively skewed and most observations situate in the lower end of the range of the distribution.

Mean incomes are slightly lower than median incomes in the other three boroughs; however, their skewness values are still positive. We can further inspect the data by plotting histograms.

income %>% 
  filter(BOROUGH.NAME == 'Manhattan') %>%
  ggplot(aes(x=`Median.Household.Income.2020.Estimated.`)) +
  geom_histogram(bins = 20, fill = 'orange') +
  labs(x = 'Manhattan Median Household Income', y = 'Counts') + 
    scale_x_continuous(labels=function(x) paste0("$", x/1000, "K"))+
   ggtitle('Histogram of Manhattan Median Household Income') +
  theme_minimal()
## Warning: Removed 1 rows containing non-finite values (stat_bin).

income %>% 
  filter(BOROUGH.NAME == 'Queens') %>%
  ggplot(aes(x=`Median.Household.Income.2020.Estimated.`)) +
  geom_histogram(bins = 20, fill = 'orange') +
  labs(x = 'Queens Median Household Income', y = 'Counts') + 
    scale_x_continuous(labels=function(x) paste0("$", x/1000, "K"))+
   ggtitle('Histogram of Queens Median Household Income') +
  theme_minimal()
## Warning: Removed 1 rows containing non-finite values (stat_bin).

income %>% 
  filter(BOROUGH.NAME == 'Staten Island') %>%
  ggplot(aes(x=`Median.Household.Income.2020.Estimated.`)) +
  geom_histogram(bins = 8, fill = 'orange') +
  labs(x = 'Staten Island Median Household Income', y = 'Counts') + 
    scale_x_continuous(labels=function(x) paste0("$", x/1000, "K"))+
   ggtitle('Histogram of Staten Island Median Household Income') +
  theme_minimal()

With some further investigation, we can see that these distributions are not unimodal but bimodal. Take Manhattan’s distribution for example, there is the main mode value that situates at around $130K and a second lower mode value at around $48K. Although most of the values concentrate towards the higher end of the distribution, the second mode pulls the mean below the median.

Is Manhattan average median household income significantly higher than that of NYC as a whole? Is the Bronx average median household income significantly lower than that of NYC as a whole?

Compared with all boroughs in NYC, Manhattan’s median household income is the highest value while that of Bronx is the lowest. But to find out whether there is a statistically significant difference, I am going to conduct a one-sample t-test.

calculate_t(borough.income.summary, 
            income, 
            'Manhattan',  
            'mean_income',  
            'sd_income', 
            'Median.Household.Income.2020.Estimated.', 
            'greater')
## [1] 72046.99
## [1] "Manhattan"
## [1] 99293.21
## 
##  One Sample t-test
## 
## data:  selected.borough.dataset[[selected_col]]
## t = 3.8129, df = 42, p-value = 0.0002217
## alternative hypothesis: true mean is greater than 72046.99
## 95 percent confidence interval:
##  87274.17      Inf
## sample estimates:
## mean of x 
##  99293.21

The p-value is 0.0002217, smaller than the threshold of 0.05. The result suggests that we accept the alternative hypothesis and acknowledge that Manhattan’s mean median household income is significantly greater than that of NYC.

calculate_t(borough.income.summary, income, 'Bronx',  'mean_income',  'sd_income', 'Median.Household.Income.2020.Estimated.', 'less')
## [1] 72046.99
## [1] "Bronx"
## [1] 43779.24
## 
##  One Sample t-test
## 
## data:  selected.borough.dataset[[selected_col]]
## t = -6.9858, df = 24, p-value = 1.593e-07
## alternative hypothesis: true mean is less than 72046.99
## 95 percent confidence interval:
##      -Inf 50702.29
## sample estimates:
## mean of x 
##  43779.24

The p-value is 1.593e-07, much smaller than the threshold of 0.05. The result suggests that we accept the alternative hypothesis and acknowledge that Bronx’s mean median household income is significantly less than that of NYC.

Is there a significant correlation between the sale price and median household income?

I am merging the two data sets and inspecting whether there’s a significant correlation between the sale price and median household income.

# Group the sale data by zip code
zip.grouped.sale <- belowe4M %>%
  group_by(ZIP.CODE) %>%
  summarize(mean_sale = mean(SALE.PRICE), median_sale = median(SALE.PRICE))

# Merge it with the income data
zip.merge <- merge(income, zip.grouped.sale, by = 'ZIP.CODE', all = TRUE)

# Check if two variables are linearly related
# Notes: 'lm' equals linear model
ggplot(data = zip.merge, mapping = aes(x = mean_sale, 
                                       y = `Median.Household.Income.2020.Estimated.`)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  labs(x = 'Mean Sale Price', y = 'Median Income') +
  ggtitle('Mean Sale Price and Median Income by ZIP') +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 34 rows containing non-finite values (stat_smooth).
## Warning: Removed 34 rows containing missing values (geom_point).

# Test normality
shapiro.test(zip.merge$`Median.Household.Income.2020.Estimated.`)
## 
##  Shapiro-Wilk normality test
## 
## data:  zip.merge$Median.Household.Income.2020.Estimated.
## W = 0.91036, p-value = 6.964e-09
shapiro.test(zip.merge$mean_sale)
## 
##  Shapiro-Wilk normality test
## 
## data:  zip.merge$mean_sale
## W = 0.73188, p-value = 3.111e-15
# if the variables are continuous, linearly related and normally distributed, we can move on
cor.test(x = zip.merge$`Median.Household.Income.2020.Estimated.`, y = zip.merge$mean_sale)
## 
##  Pearson's product-moment correlation
## 
## data:  zip.merge$Median.Household.Income.2020.Estimated. and zip.merge$mean_sale
## t = 6.2445, df = 145, p-value = 4.446e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3225020 0.5790934
## sample estimates:
##       cor 
## 0.4603595

The scatter plot suggests some sort of positive correlation between these two variables. And judging from the p-value that is much less than the assumed alpha level of 0.05, the correlation is statistically significant.

The results from the Shapiro-Wilk normality tests suggested that the two variables are not normally distributed and thus cast doubts on the reliability of the positive correlation. But the box plots and t-tests show that Manhattan and the Bronx represent two different sides of the NYC distribution in both income and sale price, providing additional evidence of a positive correlation.

Contexulize how expensive the property prices were using the price-to-income ratio

To better understand how expensive it is to buy a single-family house in NYC, we can calculate the price-to-income ratio by dividing the median sale price by the median household income in an area. According to City Lab, people can afford a home equal to roughly 2.6 years of their household income, meaning that 2.6 years of household income would be enough to purchase a house.

zip.merge <- zip.merge %>%
  mutate(priceIncomeRatio = zip.merge$median_sale / zip.merge$Median.Household.Income.2020.Estimated.)

cal_statistics(zip.merge, 'priceIncomeRatio')
## [1] "The mean is: 15.3226201659883"
## [1] "The median is: 13.0369681646047"
## [1] "The max is: 52.0548249764616"
## [1] "The min is: 4.01838077577956"
## [1] "skewness: 1.67589753710902"
## [1] "kurtosis: 6.27140802547186"
ggplot(data = zip.merge, mapping=aes(x=priceIncomeRatio)) +
  geom_histogram(bins = 30, fill = 'orange') +
  labs(x = 'price-to-income ratio', y = 'counts') + 
   ggtitle('NYC price-to-income ratio by zip') +
  theme_minimal()
## Warning: Removed 34 rows containing non-finite values (stat_bin).

The mean of NYC’s price-to-income ratio is 15.32, indicating that over 15 years of household income was necessary to buy a single-family home, which was a lot higher than the healthy 2.6 price-to-income ratio suggested by the City Lab.

head(zip.merge[order(zip.merge$priceIncomeRatio),])
##     ZIP.CODE                           Name Geographic.Unit
## 181    11697        11697, Breezy Point, NY        Zip Code
## 70     10464               10464, Bronx, NY        Zip Code
## 155    11411     11411, Cambria Heights, NY        Zip Code
## 157    11413 11413, Springfield Gardens, NY        Zip Code
## 166    11422            11422, Rosedale, NY        Zip Code
## 51     10308       10308, Staten Island, NY        Zip Code
##     Median.Household.Income.2020.Estimated.  BOROUGH.NAME mean_sale median_sale
## 181                               107879.27        Queens  477219.6      433500
## 70                                 93400.07         Bronx  568069.5      446160
## 155                                92719.36        Queens  463411.1      485000
## 157                                85894.76        Queens  478181.5      481000
## 166                                89171.11        Queens  493265.6      499495
## 51                                 94284.62 Staten Island  537191.9      531500
##     priceIncomeRatio
## 181         4.018381
## 70          4.776870
## 155         5.230839
## 157         5.599876
## 166         5.601534
## 51          5.637187

The zip codes with the least price-income ratios were 11697 (Queens), 10464 (Bronx), 11411 (Queens), 11413 (Queens) and 11422 (Queens). The zip codes with the highest price-income ratios were 10009 (Manhattan), 10030 (Manhattan), 10027 (Manhattan), 10031 (Manhattan), 11205 (Brooklyn).

zip <- read.csv('./data/uszips.csv')
head(zip)
##   zip      lat       lng      city state_id  state_name zcta parent_zcta
## 1 601 18.18005 -66.75218  Adjuntas       PR Puerto Rico TRUE          NA
## 2 602 18.36074 -67.17519    Aguada       PR Puerto Rico TRUE          NA
## 3 603 18.45440 -67.12201 Aguadilla       PR Puerto Rico TRUE          NA
## 4 606 18.16721 -66.93828   Maricao       PR Puerto Rico TRUE          NA
## 5 610 18.29032 -67.12244    Anasco       PR Puerto Rico TRUE          NA
## 6 612 18.40699 -66.70805   Arecibo       PR Puerto Rico TRUE          NA
##   population density county_fips county_name
## 1      17113   102.7       72001    Adjuntas
## 2      37751   476.0       72003      Aguada
## 3      47081   574.9       72005   Aguadilla
## 4       6392    58.3       72093     Maricao
## 5      26686   286.9       72011      Añasco
## 6      59369   339.1       72013     Arecibo
##                                         county_weights
## 1                  {"72001": "99.43", "72141": "0.57"}
## 2                                     {"72003": "100"}
## 3                                     {"72005": "100"}
## 4 {"72093": "94.88", "72153": "3.78", "72121": "1.35"}
## 5                  {"72011": "99.45", "72003": "0.55"}
## 6                  {"72013": "99.89", "72017": "0.11"}
##              county_names_all   county_fips_all imprecise military
## 1             Adjuntas|Utuado       72001|72141     FALSE    FALSE
## 2                      Aguada             72003     FALSE    FALSE
## 3                   Aguadilla             72005     FALSE    FALSE
## 4 Maricao|Yauco|Sabana Grande 72093|72153|72121     FALSE    FALSE
## 5               Añasco|Aguada       72011|72003     FALSE    FALSE
## 6         Arecibo|Barceloneta       72013|72017     FALSE    FALSE
##              timezone
## 1 America/Puerto_Rico
## 2 America/Puerto_Rico
## 3 America/Puerto_Rico
## 4 America/Puerto_Rico
## 5 America/Puerto_Rico
## 6 America/Puerto_Rico
final <- merge(zip.merge, zip, by.x = 'ZIP.CODE', by.y = 'zip', all.x = TRUE)
final.withoutNA <- final %>% filter(!is.na(lng), !is.na(lat))

# Setting existing coordinate as lat-long system and as in the right format 
cord.dec <- SpatialPoints(cbind(final.withoutNA$lng, final.withoutNA$lat), proj4string = CRS("+proj=longlat"))

# Transforming coordinate to UTM using WGS=84, UTM Zone=18,
cord.utm <- spTransform(cord.dec, CRS("+proj=utm +zone=18T ellps=WGS84"))
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in Proj4
## definition
final.withoutNA$coordsx = as(cord.utm, "data.frame")$coords.x1
final.withoutNA$coordsy = as(cord.utm, "data.frame")$coords.x2

write.csv(final.withoutNA,"./data/zipWithLatLon_property_sales.csv", row.names = FALSE)

Key Takeaways

Next Steps

The above analyses leave several remaining questions open to investigation:

Further analysis will be required to answer these questions.