library(dplyr)
library(stringr)
library(rvest)
library(sf)
import::from(tidygeocoder, geocode)
import::from(purrr, map, map2)
import::from(tidyr, unnest)
Finding all The Coffee shops in Brazil
In this post, I’ll show you how to map every The Coffee shop in Brazil in less time than it takes to brew a pot of coffee. All this from your laptop and without spending a dime.
We’ll use only R and a few packages to web scrape all the addresses.
What is The Coffee
The Coffee is a Japanese-inspired chain of coffee shops with a distinct minimalist visual identity. Their street shops are small, clean, and extremely space-efficient, sometimes taking up less than 20 m². Most shops are takeout only, with limited seating. They offer a wide variety of high-quality coffee at a premium price point.
The company was founded in Curitiba, in the southern part of Brazil, in 2018, and has expanded rapidly to 12 countries and over 200 shops. Its franchise model partly explains this strong expansion.
Similar to Starbucks, product customization is a major selling point: customers can choose and replace pretty much everything in their drinks, from adding an extra espresso shot to requesting an additional pump of chocolate syrup. Unlike Starbucks, however, most The Coffee shops are strictly to-go, or offer only minimal seating capacity. The Coffee (usually) doesn’t aim at becoming a third place, where friends meet to share a cup of coffee or work colleagues schedule a meeting. That said, there are exceptions, and some shops do include tables and even larger seating areas.
The Coffee also strays from the traditional friendly-neighborhood barista and instead focuses on a more technological approach. Customers mainly interact with a tablet that displays the menu and all customization choices. Friendly chatter is optional: a customer can walk in and get their coffee without exchanging a word with the barista.
Webscraping
We’ll leverage the power of R, an open-source programming language that’s widely used in data science. Using R offers numerous advantages: it’s free, the code can be reused and adapted to various contexts, and its strong emphasis on reproducibility ensures that your analyses can be replicated by others.
Setup
Finding the data
The data is extracted from The Coffee’s website. There is no single recipe or approach for web scraping: each website is organized differently, though there are patterns. In the case of The Coffee, units are grouped by country and city; additionally, each unit is identified by a name and has an address.
The website
The site presents every unit by country and city. The screenshot below shows an example unit in Belo Horizonte, Brazil.
Using R
The code below parses the main Brazil page and collects the URL of each city’s page.
# Base url
base_url <- "https://thecoffee.jp/shortcut/brasil"

# Parse HTML
page_list_brazil <- xml2::read_html(base_url)

# Gets the urls for all cities in Brazil
page_list_cities <- page_list_brazil |>
  html_elements(xpath = "//div/ul/li/a") |>
  html_attr("href")

page_list_cities <- page_list_cities[str_detect(page_list_cities, "brasil/")]

url_cities <- str_c(base_url, str_remove(page_list_cities, "shortcut/brasil/"))
The code below defines a function that scrapes the information of all shops for a given city url. The output is a simple data.frame.
scrape_the_coffee <- function(url) {

  # Parse the html
  page <- xml2::read_html(url)

  # Find the name of the shop
  coffee_shop_name <- page |>
    rvest::html_elements(xpath = "//div/ul/li/div/div/a/h4") |>
    rvest::html_text()

  # Find the address of the shop
  address_list <- page |>
    rvest::html_elements(xpath = "//div/ul/li/div/div/a/p") |>
    rvest::html_text()

  # Remove shops that are not open yet
  address_list <- address_list[!str_detect(address_list, "coming soon")]

  # Street addresses alternate with the city name: odd entries are streets,
  # and the second entry holds the city
  street_name <- address_list[seq(1, length(address_list), 2)]
  city_name <- address_list[2]

  full_address <- paste(street_name, city_name)

  # Store results in a tibble
  out <- tibble::tibble(
    name = coffee_shop_name,
    address = full_address,
    street_name = street_name,
    city_name = city_name
  )

  return(out)
}
Functional approach
The simplest way to apply this function is to map it over the vector of city urls. This approach is usually safer and quicker, and can be scaled with parallel processing (e.g. parallel::mclapply).
# Scrape all cities
coffee_locations <- map(url_cities, scrape_the_coffee)
names(coffee_locations) <- url_cities

dat <- bind_rows(coffee_locations, .id = "url")
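For larger jobs, the same pattern parallelizes with minimal changes. A minimal sketch using parallel::mclapply; the number of cores is an illustrative choice, and mc.cores greater than 1 is not supported on Windows:

# Scrape all cities in parallel (mc.cores = 4 is illustrative)
coffee_locations <- parallel::mclapply(url_cities, scrape_the_coffee, mc.cores = 4)
names(coffee_locations) <- url_cities

dat <- bind_rows(coffee_locations, .id = "url")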
Loop approach
Alternatively, a for loop makes it easy to track progress and to pause between requests, which is gentler on the server.
pb <- txtProgressBar(min = 1, max = length(url_cities), style = 3)
ls <- vector("list", length(url_cities))

for (i in seq_along(url_cities)) {
  url <- url_cities[i]
  current_city <- basename(url)
  message("Scraping data for: ", current_city)

  ls[[i]] <- scrape_the_coffee(url)
  setTxtProgressBar(pb, i)
  Sys.sleep(runif(1, min = 1, max = 5))
}
Cleaning the data
Brazilian addresses often abbreviate street types and titles (e.g. Av. for Avenida, R. for Rua). The helper below returns a named vector of regex replacements that expands these abbreviations.
unnabreviate <- function() {
  c("Av\\." = "Avenida",
    "Al\\." = "Alameda",
    "R\\." = "Rua",
    "Dr\\." = "Doutor",
    "Visc\\." = "Visconde",
    "Pres\\." = "Presidente",
    "Mal\\." = "Marechal")
}
dat <- dat |>
  mutate(
    city_name = str_remove(city_name, " - Brasil"),
    address = str_replace_all(address, unnabreviate()),
    country = "Brasil"
  )
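A quick sanity check on a made-up address shows the replacements at work:

# Illustrative input; expected output shown as a comment
str_replace_all("Av. Pres. Getúlio Vargas, 100", unnabreviate())
#> [1] "Avenida Presidente Getúlio Vargas, 100"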
Geocoding
Geocoding the data is fairly straightforward. I use the Google Maps API via the tidygeocoder
package to find the corresponding lat/lng pair for each address.
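Note that the google method requires an API key. As I recall, tidygeocoder reads it from an environment variable; the name below is my assumption, so check the package documentation:

# Assumed environment variable name; see the tidygeocoder documentation
Sys.setenv(GOOGLEGEOCODE_API_KEY = "your-api-key")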
# Geocode using Maps API
coffee <- tidygeocoder::geocode(
  dat,
  address = address,
  method = "google"
)

# Convert to spatial data.frame
shops <- st_as_sf(
  coffee,
  coords = c("long", "lat"),
  crs = 4326,
  remove = FALSE
)
Results
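The ranking below can be computed directly from the geocoded table; a minimal sketch, assuming the cleaned dat from above:

# Count shops by city and compute each city's share of the Brazilian total
dat |>
  count(city_name, sort = TRUE, name = "n_shops") |>
  mutate(share_br = n_shops / sum(n_shops) * 100)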
| City | # Shops | Share BR (%) |
|---|---|---|
| São Paulo | 53 | 26.5 |
| Curitiba | 29 | 14.5 |
| Brasília | 21 | 10.5 |
| Rio de Janeiro | 13 | 6.5 |
| Fortaleza | 10 | 5.0 |
| Porto Alegre | 9 | 4.5 |
| Florianópolis | 6 | 3.0 |
| Vitória | 5 | 2.5 |
| Campinas | 4 | 2.0 |
| Belo Horizonte | 3 | 1.5 |
library(leaflet)

# Interactive map of all shops, centered on São Paulo
leaflet(shops) %>%
  addTiles() %>%
  addMarkers(label = ~name) %>%
  addProviderTiles("CartoDB") %>%
  setView(lng = -46.65590, lat = -23.561197, zoom = 12)
Get Google Maps ratings
To enrich the data, we can query the Google Places API for user ratings around each shop. The google_places() function comes from the googleway package and requires its own API key. The example below focuses on Curitiba.
library(googleway)
# Requires an API key, e.g. googleway::set_key("your-api-key")

cur_shops <- filter(shops, city_name == "Curitiba")

get_ratings <- function(lat, lng) {

  location <- c(lat, lng)
  # Search for "The Coffee" within a small radius of the shop's coordinates
  places <- google_places("The Coffee", location = location, radius = 10)
  res <- places$results

  subres <- res %>%
    unnest(cols = "geometry") %>%
    unnest(cols = "location") %>%
    select(
      business_status, name, formatted_address, rating, user_ratings_total,
      lat, lng
    )

  return(subres)
}
ratings <- map2(cur_shops$lat, cur_shops$long, get_ratings)

dat_ratings <- ratings |>
  bind_rows() |>
  distinct() |>
  filter(str_detect(name, "^The Coffee"))
dat_ratings <- dat_ratings |>
  mutate(
    street_name = str_extract(formatted_address, "^[^,]+"),
    street_name = str_replace_all(street_name, unnabreviate()),
    street_name = stringi::stri_trans_general(street_name, "latin-ascii"),
    street_number = as.numeric(str_extract(formatted_address, "(?<=, )\\d+(?=\\b)"))
  )
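For a typical formatted address (an illustrative value), the two extractions behave as follows:

x <- "R. XV de Novembro, 123 - Centro, Curitiba"
str_extract(x, "^[^,]+")                          #> "R. XV de Novembro"
as.numeric(str_extract(x, "(?<=, )\\d+(?=\\b)"))  #> 123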
Merging with Census information
The code below shows how to gather census tract data for each The Coffee shop. Census tracts are the smallest administrative division for which socioeconomic and demographic data are available. In the code, city_shops holds the geocoded shops of a given city and city_census is an sf table of census tracts with the variables of interest (v0001 and v0003).
# Interpolation -------------------------------------------------------------

## Functions -----------------------------------------------------------------

# Creates a n-minute walk isochrone around a point
get_buffer <- function(point, radius = 5, simplified = FALSE) {

  stopifnot(length(radius) == 1 && is.numeric(radius))

  if (simplified) {
    point |>
      sf::st_transform(crs = 31982) |>
      # Simplified assumption: buffer at an average walking speed of
      # ((1.5 - 1.2) / 2 + 1.2) = 1.35 m/s times 60 s times radius minutes
      sf::st_buffer(dist = ((1.5 - 1.2) / 2 + 1.2) * 60 * radius)
  } else {
    point |>
      sf::st_transform(crs = 31982) |>
      osrm::osrmIsochrone(breaks = radius, osrm.profile = "foot") |>
      nngeo::st_remove_holes()
  }
}

# Interpolates an area with census tracts and aggregates population and households
interpolate_census <- function(census, target, variables = c("v0001", "v0003")) {

  if (st_crs(census) != st_crs(target)) {
    warning("CRS mismatch")
    census <- st_transform(census, crs = 31982)
    target <- st_transform(target, crs = 31982)
  }

  # Select variables
  census <- dplyr::select(census, dplyr::all_of(variables))
  # Interpolate areas
  interp <- st_interpolate_aw(census, target, extensive = TRUE)

  return(interp)
}

# Wrapper around get_buffer and interpolate_census
find_population <- function(shop, census, radius = 5, simplified = FALSE) {

  # Compute a radius-minute isochrone around the shop
  buffer <- get_buffer(shop, radius, simplified)
  interpolated <- suppressWarnings(interpolate_census(census, buffer))

  return(interpolated)
}

## Interpolate ----------------------------------------------------------------

# Uniquely identify each shop
city_shops <- city_shops |>
  mutate(shop_id = row_number())

# To improve speed, convert the full census data to 31982
city_census_utm <- st_transform(city_census, crs = 31982)

city_shops_census <- parallel::mclapply(
  split(city_shops, city_shops$shop_id),
  \(x) find_population(x, census = city_census_utm)
)

city_shops_census <- bind_rows(city_shops_census, .id = "shop_id")

city_shops <- city_shops |>
  mutate(shop_id = as.character(shop_id)) |>
  left_join(st_drop_geometry(city_shops_census))
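The merged table can be saved as a GeoPackage for later use; a minimal sketch, with the file name mirroring the one read back below:

# Save the final spatial table (illustrative path)
sf::st_write(city_shops, "curitiba_tcf_interpolate.gpkg")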
Reading the file back into R confirms the final result: a spatial table of the 29 Curitiba shops with the interpolated census fields.

Reading layer `curitiba_tcf_interpolate' from data source
`/Users/viniciusoike/Documents/GitHub/restateinsight/static/data/curitiba_tcf_interpolate.gpkg'
using driver `GPKG'
Simple feature collection with 29 features and 11 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -49.32655 ymin: -25.45011 xmax: -49.21461 ymax: -25.40498
Geodetic CRS: WGS 84