Scraping hockey-reference.com

Daniel Morse

library(hockeyR)
`%>%` <- magrittr::`%>%`

Hockey-Reference scrapers

These functions scrape data from hockey-reference.com.

Grab every team’s win-loss record in any season going back to 1918 with the get_team_records() function.

# Standings for the season ending in 1967: keep the team identifiers and the
# record columns, then list the clubs from most wins to fewest.
get_team_records(1967) %>%
  dplyr::select(
    team_name, team_abbr, season,
    overall, w, l, otl, st_points
  ) %>%
  dplyr::arrange(dplyr::desc(w))
#> # A tibble: 6 x 8
#>   team_name           team_abbr season  overall      w     l   otl st_points
#>   <chr>               <chr>     <chr>   <chr>    <int> <int> <int>     <dbl>
#> 1 Chicago Black Hawks CBH       1966-67 41-17-12    41    17    12        94
#> 2 Montreal Canadiens  MTL       1966-67 32-25-13    32    25    13        77
#> 3 Toronto Maple Leafs TOR       1966-67 32-27-11    32    27    11        75
#> 4 New York Rangers    NYR       1966-67 30-28-12    30    28    12        72
#> 5 Detroit Red Wings   DET       1966-67 27-39-4     27    39     4        58
#> 6 Boston Bruins       BOS       1966-67 17-43-10    17    43    10        44

You can also get stats down to the player level with get_player_stats_hr(). This function defaults to the player’s career statistics, but you can enter a specific season or range of seasons as well. Note that the season references the year the specific season ended (i.e., the 2021-22 season should be entered as 2022).

# Single-season lookup for one player; the season is given by its end year,
# so 1982 returns the 1981-82 row. Only the identifying and scoring columns
# are kept for display.
get_player_stats_hr(
  player_name = "Wayne Gretzky",
  season = 1982
) %>%
  dplyr::select(
    player, age, season_full, tm,
    gp, g, a, pts
  )
#> # A tibble: 1 x 8
#>   player          age season_full tm       gp     g     a   pts
#>   <chr>         <int> <chr>       <chr> <int> <int> <int> <int>
#> 1 Wayne Gretzky    21 1981-82     EDM      80    92   120   212

To get the basic counting stats for every skater in a given year, use the get_skater_stats_hr() function. Like the other functions in this package, the season argument needs to be the end-year of the season you wish to scrape.

# Skater counting stats for the season ending in 2022
df2 <- get_skater_stats_hr(2022)

# Preview the six rows with the highest goal totals
df2 %>%
  dplyr::arrange(dplyr::desc(goals)) %>%
  utils::head(n = 6L)
#> # A tibble: 6 x 30
#>   player       team_~1 season   age posit~2 games~3 goals assists points plus_~4
#>   <chr>        <chr>   <chr>  <int> <chr>     <int> <int>   <int>  <int>   <int>
#> 1 Auston Matt~ TOR     2021-~    24 C            73    60      46    106      20
#> 2 Leon Draisa~ EDM     2021-~    26 C            80    55      55    110      17
#> 3 Chris Kreid~ NYR     2021-~    30 LW           81    52      25     77      19
#> 4 Alex Ovechk~ WSH     2021-~    36 LW           77    50      40     90       8
#> 5 Kyle Connor  WPG     2021-~    25 LW           79    47      46     93      -3
#> 6 Kirill Kapr~ MIN     2021-~    24 LW           81    47      61    108      27
#> # ... with 20 more variables: penalty_minutes <int>, hr_point_shares <dbl>,
#> #   goals_even_strength <int>, goals_powerplay <int>, goals_shorthanded <int>,
#> #   goals_game_winning <int>, assists_even_strength <int>,
#> #   assists_powerplay <int>, assists_shorthanded <int>, shots_on_goal <int>,
#> #   shooting_percent <dbl>, time_on_ice <int>, mean_time_on_ice <chr>,
#> #   blocks <int>, hits <int>, faceoff_wins <int>, faceoff_losses <int>,
#> #   faceoff_win_percent <dbl>, link <chr>, player_id <chr>, and abbreviated ...

You can also use the data to make plots with actual team colors and logos, using the team_logos_colors file included with the package.

# Take the ten highest point totals, keep only the columns the chart needs,
# then attach team colors & logos by team abbreviation
df3 <- df2 %>%
  dplyr::arrange(dplyr::desc(points)) %>%
  dplyr::slice(seq_len(10)) %>%
  dplyr::select(player, team_abbr, goals, assists, points) %>%
  dplyr::left_join(team_logos_colors, by = "team_abbr")

# Bar chart of the rows in df3: one bar per player, ordered from the highest
# point total to the lowest, drawn in team colors with the team logo on top.
ggplot2::ggplot(
  data = df3,
  mapping = ggplot2::aes(x = stats::reorder(player, -points), y = points)
) +
  # fill/outline are set outside aes(), so they are taken in df3 row order
  ggplot2::geom_col(fill = df3$team_color1, color = df3$team_color2) +
  # logo sits slightly above the top of each bar
  ggimage::geom_image(
    mapping = ggplot2::aes(y = points + 2, image = team_logo_espn),
    size = 0.05,
    asp = 1.5
  ) +
  # player names run vertically inside the bars; the x-axis text is blanked
  # in the theme below
  ggplot2::geom_text(
    mapping = ggplot2::aes(y = 2, label = player),
    color = "white",
    angle = 90,
    hjust = 0
  ) +
  ggplot2::scale_y_continuous(breaks = scales::pretty_breaks()) +
  # black canvas with white text; no x-axis decorations
  ggplot2::theme(
    plot.background = ggplot2::element_rect(fill = "black"),
    panel.background = ggplot2::element_rect(fill = "black"),
    panel.grid.major.x = ggplot2::element_blank(),
    axis.ticks.x = ggplot2::element_blank(),
    axis.text.x = ggplot2::element_blank(),
    axis.text.y = ggplot2::element_text(color = "white"),
    title = ggplot2::element_text(color = "white")
  ) +
  ggplot2::labs(
    x = NULL,
    y = "Points",
    title = "Top-10 point scorers in the NHL in 2021-22",
    caption = "data pulled from hockey-reference.com using hockeyR"
  )

Use the get_rosters() function to look up a team roster at season’s end for any prior season. By default, it will only pull basic player info (name, age, height & weight, etc.), but you can grab all the basic counting stats by setting include_stats to TRUE. Note, as shown below, that the team argument accepts either full team names or team abbreviations.

# Rosters with counting stats for two teams in the season ending in 2001.
# One team is given by abbreviation and the other by full name.
player_stats <- get_rosters(
  c("COL", "Detroit red wings"),
  season = 2001,
  include_stats = TRUE
) %>%
  # keep players with at least 300 minutes of ice time
  dplyr::filter(toi >= 300) %>%
  # scoring rates per 60 minutes of ice time
  dplyr::mutate(
    g_60 = g * 60 / toi,
    a_60 = a * 60 / toi,
    p_60 = pts * 60 / toi
  ) %>%
  # attach team colors & logos for plotting
  dplyr::left_join(team_logos_colors, by = "team_abbr")

# Players to label on the plot: the ten highest points-per-60 rates in
# player_stats. slice_max() keeps ties at the cutoff (with_ties = TRUE), the
# same rule as "p_60 >= the 10th-best value", and it returns every row when
# fewer than ten players qualify instead of failing on an empty cutoff.
# Rows come back sorted by p_60, highest first.
top_performers <- dplyr::slice_max(
  player_stats,
  order_by = p_60,
  n = 10,
  with_ties = TRUE
)

# Scatter of assists/60 (x) against goals/60 (y) for every row in
# player_stats, drawn with team logos and with the top performers labelled.
ggplot2::ggplot(
  data = player_stats,
  mapping = ggplot2::aes(x = a_60, y = g_60)
) +
  # dashed reference lines at the pooled per-60 rates of all plotted players
  # (total goals or assists over total ice time)
  ggplot2::geom_hline(
    yintercept = with(player_stats, 60 * sum(g) / sum(toi)),
    linetype = "dashed",
    color = "black"
  ) +
  ggplot2::geom_vline(
    xintercept = with(player_stats, 60 * sum(a) / sum(toi)),
    linetype = "dashed",
    color = "black"
  ) +
  # disabled alternative: points sized by ice time instead of logos
  # geom_point(aes(size = toi), show.legend = FALSE,
  #            color = player_stats$team_color_alt1, alpha = .8) +
  ggimage::geom_image(
    mapping = ggplot2::aes(image = team_logo_espn),
    size = 0.07,
    asp = 1.5
  ) +
  # name labels only for top_performers, colored per row of that data frame
  ggrepel::geom_text_repel(
    data = top_performers,
    mapping = ggplot2::aes(label = player),
    color = top_performers$team_color_alt1
  ) +
  ggplot2::scale_x_continuous(breaks = scales::pretty_breaks()) +
  ggplot2::scale_y_continuous(breaks = scales::pretty_breaks()) +
  ggplot2::theme(
    plot.background = ggplot2::element_rect(fill = "#708090"),
    panel.background = ggplot2::element_rect(fill = "#708090"),
    title = ggplot2::element_text(color = "white")
  ) +
  ggplot2::labs(
    x = "Assists/60",
    y = "Goals/60",
    title = "2000-01 Wings v Avs, regular season stats",
    subtitle = "min. 300 minutes",
    caption = "data pulled from hockey-reference.com using hockeyR"
  )