Importing and merging datasets [10 points]

In this homework, we’ll adapt data from Chicago-IL communities. The original source is the GeoDa Data and Lab. The first dataset Airbnb.RDS refers to Airbnb rentals, socioeconomic indicators, and crime by community area in Chicago. The Communities.xls dataset contains health and socioeconomic indicators for the 77 community areas of Chicago, 2012-2014.

  1. Import both datasets and use the function head() to see only the first five rows from each one. This might help you to import the Communities data.

  2. Merge the datasets using the function full_join(). What is the dimension of your new dataset? What variables do they have in common (variables with the same column name)?

a)

library(tidyverse)
library(readxl)

airbnb<-readRDS("airbnb.RDS")
head(airbnb,5)
##         community    shape_area     shape_len ComAreaID response_r accept_r rev_rating
## 1         DOUGLAS 46004621.1581 31027.0545098        35   98.77143 94.51429   87.77778
## 2         OAKLAND 16913961.0408 19565.5061533        36   99.20000 90.10526   88.81250
## 3     FULLER PARK 19916704.8692          <NA>        37   68.00000       NA   91.75000
## 4 GRAND BOULEVARD 48492503.1554 28196.8371573        38   94.03704 83.61539   92.75000
## 5         KENWOOD 29071741.9283 23325.1679062        39   92.54286 88.14286   90.65625
##    price_pp room_type num_spots poverty crowded dependency without_hs unemployed
## 1  78.15789  1.789474        38    29.6     1.8       30.7       14.3       18.2
## 2  53.77500  1.850000        20    39.7     1.3       40.4       18.4       28.7
## 3  84.00000  1.833333         6    51.2     3.2       44.9       26.6       33.9
## 4 119.53333  1.533333        30    29.3     3.3       39.5       15.9       24.3
## 5  77.99145  1.615385        39    21.7     2.4       35.4       11.3       15.7
##   income_pc harship_in num_crimes num_theft population     dist     area
## 1     23791         47       5013      1241      18238 4.531537 4.273829
## 2     19252         78       1306       311       5918 6.029868 1.571301
## 3     10432         97       1764       383       2876 7.429596 1.850268
## 4     23472         57       6416      1428      21929 6.990224 4.504953
## 5     35911         26       2713       654      17841 7.776572 2.700750
communities<-read_excel("communities.xls", sheet="chitown")
head(communities,5)
## # A tibble: 5 x 87
##   ...1  ComAreaID community TRACTCnt shape_area shape_len Pop2012 Pop2014 PopChng PopM 
##   <chr> <chr>     <chr>     <chr>         <dbl>     <dbl> <chr>   <chr>     <dbl> <chr>
## 1 0     35        DOUGLAS   10        46004621.    31027. 18238   19430      3.16 8294 
## 2 1     36        OAKLAND   3         16913961.    19566. 5918    6473       4.48 2499 
## 3 2     37        FULLER P~ 2         19916705.    25339. 2876    2543      -6.14 1218 
## 4 3     38        GRAND BO~ 14        48492503.    28197. 21929   22531      1.35 9681 
## 5 4     39        KENWOOD   7         29071742.    23325. 17841   18217      1.04 8543 
## # ... with 77 more variables: PopMP <dbl>, PopF <chr>, PopFP <dbl>, Under5 <chr>,
## #   Under5P <dbl>, Under18 <chr>, Under18P <dbl>, Over18 <chr>, Over18P <dbl>,
## #   Over21 <chr>, Over21P <dbl>, Over65 <chr>, Over65P <dbl>, Wht14 <chr>, Wht14P <dbl>,
## #   Blk14 <chr>, Blk14P <dbl>, AI14 <chr>, AI14P <dbl>, AS14 <chr>, AS14P <dbl>,
## #   NHP14 <chr>, NHP14P <dbl>, Oth14 <chr>, Oth14P <dbl>, Hisp14 <chr>, Hisp14P <dbl>,
## #   Property_C <chr>, PropCrRt <dbl>, Violent_C <chr>, VlntCrRt <dbl>, PerCInc14 <chr>,
## #   PPop14 <chr>, Pov14 <chr>, field_37 <dbl>, ChldPov14 <chr>, NoHS14 <chr>,
## #   HSGrad14 <chr>, SmClg14 <chr>, ClgGrad14 <chr>, LaborFrc <chr>, Unemp14 <chr>,
## #   Pov50 <chr>, Pov50P <dbl>, Pov125 <chr>, Pov125P <dbl>, Pov150 <chr>, Pov150P <dbl>,
## #   Pov185 <chr>, Pov185P <dbl>, Pov200 <chr>, Pov200P <dbl>, COIave <dbl>, HISave <dbl>,
## #   SESave <dbl>, Hlitave <dbl>, BirthRate <dbl>, FertRate <chr>, LoBirthR <chr>,
## #   PrenScrn <chr>, PretBrth <dbl>, TeenBirth <dbl>, Assault <dbl>, BrstCancr <dbl>,
## #   CancerAll <dbl>, Colorect <dbl>, DiabetM <dbl>, FirearmM <dbl>, InfntMR <dbl>,
## #   LungCancer <dbl>, ProstateC <dbl>, Stroke <dbl>, ChlBLLS <dbl>, ChlLeadP <chr>,
## #   GonorrF <dbl>, GonorrM <dbl>, Tuberc <dbl>

b)

full_data<-full_join(airbnb, communities, by="community")
dim(full_data)
## [1]  77 108
names(full_data)
##   [1] "community"    "shape_area.x" "shape_len.x"  "ComAreaID.x"  "response_r"  
##   [6] "accept_r"     "rev_rating"   "price_pp"     "room_type"    "num_spots"   
##  [11] "poverty"      "crowded"      "dependency"   "without_hs"   "unemployed"  
##  [16] "income_pc"    "harship_in"   "num_crimes"   "num_theft"    "population"  
##  [21] "dist"         "area"         "...1"         "ComAreaID.y"  "TRACTCnt"    
##  [26] "shape_area.y" "shape_len.y"  "Pop2012"      "Pop2014"      "PopChng"     
##  [31] "PopM"         "PopMP"        "PopF"         "PopFP"        "Under5"      
##  [36] "Under5P"      "Under18"      "Under18P"     "Over18"       "Over18P"     
##  [41] "Over21"       "Over21P"      "Over65"       "Over65P"      "Wht14"       
##  [46] "Wht14P"       "Blk14"        "Blk14P"       "AI14"         "AI14P"       
##  [51] "AS14"         "AS14P"        "NHP14"        "NHP14P"       "Oth14"       
##  [56] "Oth14P"       "Hisp14"       "Hisp14P"      "Property_C"   "PropCrRt"    
##  [61] "Violent_C"    "VlntCrRt"     "PerCInc14"    "PPop14"       "Pov14"       
##  [66] "field_37"     "ChldPov14"    "NoHS14"       "HSGrad14"     "SmClg14"     
##  [71] "ClgGrad14"    "LaborFrc"     "Unemp14"      "Pov50"        "Pov50P"      
##  [76] "Pov125"       "Pov125P"      "Pov150"       "Pov150P"      "Pov185"      
##  [81] "Pov185P"      "Pov200"       "Pov200P"      "COIave"       "HISave"      
##  [86] "SESave"       "Hlitave"      "BirthRate"    "FertRate"     "LoBirthR"    
##  [91] "PrenScrn"     "PretBrth"     "TeenBirth"    "Assault"      "BrstCancr"   
##  [96] "CancerAll"    "Colorect"     "DiabetM"      "FirearmM"     "InfntMR"     
## [101] "LungCancer"   "ProstateC"    "Stroke"       "ChlBLLS"      "ChlLeadP"    
## [106] "GonorrF"      "GonorrM"      "Tuberc"

For the repeated column names - shape_area, shape_len, and ComAreaID -, full_join identifies the ones from the first dataset (airbnb) as .x and from the second (communities) as .y. If you merge by ComAreaID, you will have instead community.x and community.y.

Subsetting, creating new variables & summarizing information [65 Points]

Important: You can find more information about the variables in the Airbnb and Communities datasets here and here. I added two more variables to the Airbnb data: area and dist. They represent the total community area and the distance (in km) from the community to Chicago downtown, respectively.

  1. You want to work only with the following columns: community, price_pp, num_spots, rev_rating, PerCInc14 , num_theft, FirearmM ,unemployed, harship_in, Pop2014, BirthRate, Over65, dist, and area. Select only those variables and store them in a new data frame. [5 points]

  2. Create the new variable theft_rate dividing the total number of thefts by the population in 2014. [5 points]

  3. First, divide the total population in 2014 by the community’s area to get values for population density (number of people per square mile). Then, create the new variable logdens, taking the natural logarithm of population density. [5 points]

  4. Filter your new dataset to identify Central Chicago. In other words, you want to filter communities within \(3 km\) from Chicago downtown.
    What is the average number of Airbnb spots in Central Chicago? What are the average Airbnb prices, per capita income, theft rate, firearm-related deaths, population density, and birth rate in Central Chicago? [15 points]

  5. Compare those numbers you just got from Central Chicago (Airbnb spots and prices, income, crime, and birth rate) with the values from Central Chicago adjacent communities - communities that share a border with Central Chicago. Discuss the results. [15 points]

  6. Finally, compare the values for the same variables in Central Chicago with the average numbers from “Far from downtown” - the communities that have a distance from downtown higher than 19 km. [15 points]

  7. Do you have any explanation for the presence of birth rate in d., e., and f. (i.e did you expect to find a relationship between birth rate and distance from downtown)? [5 points]

a)

sub_data<-full_data%>%dplyr::select(community, price_pp, num_spots, 
                                    rev_rating, PerCInc14 , num_theft, 
                                    FirearmM ,unemployed, harship_in, 
                                    Pop2014, BirthRate, Over65, dist, area)

b)

When you try to divide num_theft by Pop2014, you get an error: non-numeric argument to binary operator. Pop2014 was imported as character instead of numeric. You can check the class of all variables using class(). To check the class for all columns, use lapply().

lapply(sub_data,class)
## $community
## [1] "character"
## 
## $price_pp
## [1] "numeric"
## 
## $num_spots
## [1] "numeric"
## 
## $rev_rating
## [1] "numeric"
## 
## $PerCInc14
## [1] "character"
## 
## $num_theft
## [1] "numeric"
## 
## $FirearmM
## [1] "numeric"
## 
## $unemployed
## [1] "numeric"
## 
## $harship_in
## [1] "numeric"
## 
## $Pop2014
## [1] "character"
## 
## $BirthRate
## [1] "numeric"
## 
## $Over65
## [1] "character"
## 
## $dist
## [1] "numeric"
## 
## $area
## [1] "units"

So before get the theft rate, you’ll need to transform those variables to numeric.

sub_data<-sub_data%>%mutate(theft_rate=num_theft/as.numeric(Pop2014))

## Another way to do it:
## sub_data$num_theft<-sub_data$num_theft/as.numeric(sub_data$Pop2014)

c)

sub_data$popdens=as.numeric(sub_data$Pop2014)/as.numeric(sub_data$area)
sub_data$logdens=log(sub_data$popdens)

d)

central<-sub_data%>%filter(dist<=3)

central%>%summarize(avg_price=mean(price_pp),
                    avg_spots=mean(as.numeric(num_spots)),
                    avg_income=mean(as.numeric(PerCInc14)),
                    avg_theftrate=mean(theft_rate), 
                    avg_firearm=mean(FirearmM), 
                    avg_popdems=mean(popdens),
                    avg_birth=mean(BirthRate))
##   avg_price avg_spots avg_income avg_theftrate avg_firearm avg_popdems avg_birth
## 1  150.3437  277.6667   714819.3     0.1467342         4.6    7905.481  13.83333

e)

That was the way I approached the problem:

IDs<-c(7,24,28,31,34,35)
adjacent<-full_data%>%dplyr::select(community, ComAreaID.x)%>%filter(ComAreaID.x%in%IDs)
adjacent
##         community ComAreaID.x
## 1         DOUGLAS          35
## 2       WEST TOWN          24
## 3  NEAR WEST SIDE          28
## 4 LOWER WEST SIDE          31
## 5   ARMOUR SQUARE          34
## 6    LINCOLN PARK           7

There you have the names of adjacent communities. Then you can filter sub_data for those adjacent community names you collected in adjacent.

neigh_central<-sub_data%>%filter(community%in%adjacent$community)
neigh_central%>%summarize(avg_price=mean(price_pp),
                    avg_spots=mean(as.numeric(num_spots)),
                    avg_income=mean(as.numeric(PerCInc14)),
                    avg_theftrate=mean(theft_rate), 
                    avg_firearm=mean(FirearmM), 
                    avg_popdems=mean(popdens),
                    avg_birth=mean(BirthRate))
##   avg_price avg_spots avg_income avg_theftrate avg_firearm avg_popdems avg_birth
## 1  99.42827  262.8333   659005.2    0.05955793         6.8    5614.339     14.75

f)

far_neigh<-sub_data%>%filter(dist>19)
far_neigh%>%summarize(avg_price=mean(price_pp, na.rm=T),
                    avg_spots=mean(as.numeric(num_spots)),
                    avg_income=mean(as.numeric(PerCInc14)),
                    avg_theftrate=mean(theft_rate), 
                    avg_firearm=mean(FirearmM), 
                    avg_popdems=mean(popdens),
                    avg_birth=mean(BirthRate))
##   avg_price avg_spots avg_income avg_theftrate avg_firearm avg_popdems avg_birth
## 1  59.64167         2   124273.6    0.02839486       18.87    2252.921     13.82

Discussion

The average number of Airbnb spots in communities far from the city center is considerably lower than the number of sites in the center or close to the center. On “demand terms”: people will consider Loop and Near North Side as the best areas to stay and enjoy walking distance amenities. However, the prices there are much higher, so communities near the city’s center, with easy access to public transportation and reasonable commuting time, are good alternatives. That is one reason you see as many spots in the adjacent communities as downtown. Finally, the average Airbnb price, income, and non-violent crime rate are considerably lower as you go farther from the center. On the other hand, firearm-related deaths are much higher in distant communities compared to the downtown area.

The average birth rate does not have a clear pattern. Maybe including the suburban area, you might see a pattern. Couples raising a family need more space, and the price per square foot in the suburbs is considerably lower than downtown.

Go one step further and divide the communities in north and south: they are considerably different in many aspects.

Scatter plots [35 Points] Scatter plots

  1. One of the regularities within cities predicted by the Alonso-Muth-Mills model is the decline of population density with distance to the Central Business District. Build a scatterplot to show that this is happening in Chicago, i.e., there is a negative relationship between logdens (y-axis) and dist (x-axis) in our dataset. [10 points]

  2. Now, let’s take a look at the relationship between Airbnb prices and the distance to Downtown Chicago. Construct a scatter plot between those variables (y-axis prices and x-axis distance). What do you see? What is the reasoning behind this relationship? [10 points]

Correlation x Causality
  1. Finally, plot the relationship between Airbnb prices (y-axis) and theft rate (x-axis). What do you see? Do you have an explanation for that relationship to hold? [15 points]

a)

library(ggthemes)
library(ggrepel)

ggplot(sub_data, aes(x=dist, y=logdens)) + 
  geom_point(color="darkgrey") + 
  geom_text_repel(aes(label=community), size=3)+
  stat_smooth(method = "lm", formula =y~x, se=F, color="black") +
  scale_x_continuous(name = "Distance from Navey Pier") +
  scale_y_continuous(name = "Log Population Density") +
  theme_economist_white(base_size = 17, gray_bg=FALSE)+
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"))

b)

ggplot(sub_data, aes(x=dist, y=price_pp)) + 
  geom_point(color="darkgrey") + 
  geom_text_repel(aes(label=community), size=3)+
  stat_smooth(method = "lm", formula =y~x, se=F, color="black") +
  scale_x_continuous(name = "Distance from Navey Pier") +
  scale_y_continuous(name = "Airbnb Prices") +
  theme_economist_white(base_size = 17, gray_bg=FALSE)+
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"))

As one can see above, there is a negative relationship between Airbnb prices and distance from Navy Pier. That was expected since the city center also concentrates the entertainment options. You can also think about the relationship between the house value and the price charged by the host. The price per square foot is higher in downtown, so is the Airbnb price.

c)

ggplot(sub_data, aes(x=theft_rate, y=price_pp)) + 
  geom_point(color="darkgrey") + 
  geom_text_repel(aes(label=community), size=3)+
  stat_smooth(method = "lm", formula =y~x, se=F, color="black") +
  scale_x_continuous(name = "Theft Rate") +
  scale_y_continuous(name = "Airbnb Prices") +
  theme_economist_white(base_size = 17, gray_bg=FALSE)+
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"))

There is a positive relationship between theft rate and Airbnb prices. However, that does not mean that more thefts cause higher airbnb prices. Downtown concentrates visitors and tourists and, therefore, more opportunities for theft.