3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
__pycache__/
*.py[cod]
*$py.class
34 changes: 34 additions & 0 deletions README.md
@@ -1,5 +1,39 @@
# pfr_metadata_pull

All of your favorite pfr_metadata_pull code, now in package form! Usage is as simple as:

1. Clone this repo
2. Make sure your local copy of the repo lives in a directory that's in your PYTHONPATH
3. Open up python in your manner of choice and type the following:
```python
import pfr_metadata_pull as meta

meta.scrape_links(start_year, end_year, output_path) # creates a file "game_links_startyear_to_endyear.csv" in the 'output_path' directory
meta.pull_data_from_links("game_links_startyear_to_endyear.csv", output_path) # creates a file "game_meta_data.csv" in the 'output_path' directory
meta.fix_weeks("game_meta_data.csv", output_path) # creates a file "game_meta_data_weeks_fixed.csv" in the 'output_path' directory
meta.format_data("game_meta_data_weeks_fixed.csv", output_path) # creates two files in the 'output_path' directory
```
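For a concrete run, here's the same pipeline over the 2015-2019 seasons (the years and output path below are just examples; per the `scrape_links` docstring, the start year can be as early as 1960, and the path must have no trailing slash):
```python
import pfr_metadata_pull as meta

output_path = '/path/to/data'  # example location; no trailing slash

meta.scrape_links(2015, 2019, output_path)                              # writes game_links_2015_to_2019.csv
meta.pull_data_from_links('game_links_2015_to_2019.csv', output_path)   # writes game_meta_data.csv
meta.fix_weeks('game_meta_data.csv', output_path)                       # writes game_meta_data_weeks_fixed.csv
meta.format_data('game_meta_data_weeks_fixed.csv', output_path)         # writes the two files described below
```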
The final `format_data` function makes two files. One, "game_meta_data_formatted.csv", is a nice, readable version of the metadata.
The other, __"game_meta_data_ready_to_merge.csv"__, is what you'll need to add metadata to an existing play-by-play file.
Say you have a file "pbp.csv" that spans some range of seasons, and you've just created "game_meta_data_ready_to_merge.csv" for that same range of seasons. Now you can do:
```python
import pandas as pd
pbp = pd.read_csv('pbp.csv')
meta = pd.read_csv('game_meta_data_ready_to_merge.csv')
pbp_meta = pd.merge(pbp, meta, on=['season','week','home_team','away_team'], how='left')
```
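If you want to check that every play actually found its game, the `indicator` flag on `pd.merge` gives a quick sanity check; a minimal sketch of the same merge as above:
```python
import pandas as pd

pbp = pd.read_csv('pbp.csv')
meta = pd.read_csv('game_meta_data_ready_to_merge.csv')

# indicator=True adds a '_merge' column; 'left_only' rows found no metadata
pbp_meta = pd.merge(pbp, meta, on=['season', 'week', 'home_team', 'away_team'],
                    how='left', indicator=True)
print(pbp_meta['_merge'].value_counts())
pbp_meta = pbp_meta.drop(columns='_merge')
```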
Or maybe you do this part in R:
```R
library(tidyverse)
pbp <- read_csv('pbp.csv')
meta <- read_csv('game_meta_data_ready_to_merge.csv')
pbp_meta <- left_join(pbp, meta, by = c("season", "week", "home_team", "away_team"))
```

These changes were made by [Dennis Brookner](https://github.com/dennisbrookner); direct concerns to me, or to [Puntalytics](https://twitter.com/ThePuntRunts) on Twitter.

### Original README from greerre

This repo contains the set of scripts used to create the dataset referenced here:

https://twitter.com/greerreNFL/status/1146519422527389696
12 changes: 12 additions & 0 deletions __init__.py
@@ -0,0 +1,12 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 26 16:25:40 2020

@author: dennisbrookner
"""

from .pfr_game_link_scraper import scrape_links
from .pfr_meta_data_pull import pull_data_from_links
from .week_name_stopgap import fix_weeks
from .pfr_meta_data_format import format_data
145 changes: 85 additions & 60 deletions pfr_game_link_scraper.py
@@ -12,65 +12,90 @@
import pandas as pd
import numpy

data_folder = 'file path to folder where all data will be held...no trailing slash'

season_start = 1960
season_end = 2018
current_season = season_start

url_base = 'https://www.pro-football-reference.com'
game_data = []

while current_season <= season_end:
time.sleep((1.5 + random.random() * 2))
url = '{0}/years/{1}/week_1.htm'.format(url_base,current_season)
print('Requesting weeks for the {0} season...'.format(current_season))
raw = requests.get(url)
parsed = BeautifulSoup(raw.content, 'html.parser')
all_anchors = parsed.find_all('a',href=True) ## anchors used b/c commenting makes pulling specific divs hard ##
week_links = []
for a in all_anchors:
if '/years/{0}/week_'.format(current_season) in a.get('href'):
week_info = {
'Week Name' : None,
'Week Link' : None,
}
week_info['Week Name'] = str(a.text)
week_info['Week Link'] = '{0}{1}'.format(url_base,str(a.get('href')))
week_links.append(week_info)
## remove duplicates from week_links ##:
## from https://stackoverflow.com/questions/9427163/remove-duplicate-dict-in-list-in-python ##
seen_links = []
new_link_list = []
for d in week_links:
t = d['Week Link']
if t not in seen_links:
seen_links.append(t)
new_link_list.append(d)
week_links = new_link_list
print(' * Found {0} weeks...'.format(len(week_links)))
for week in week_links:
print(' * Pulling {0} game links'.format(week['Week Name']))
time.sleep((.75 + random.random() * 1.5))
url = week['Week Link']
raw_week = requests.get(url)
parsed_week = BeautifulSoup(raw_week.content, 'html.parser')
week_anchors = parsed_week.find_all('a',href=True)
for a in week_anchors:
if '/boxscores/{0}'.format(current_season) in a.get('href') or '/boxscores/{0}'.format(current_season + 1) in a.get('href'):
box_info = {
'Season' : None,
'Week' : None,
'Week Number' : None,
'Box Score Link' : None,
def scrape_links(start_year, end_year, output_path):
'''
Start year can be as early as 1960; end year can be as late as the current year.

Output path should point to the desired location of the metadata, and should NOT have a trailing slash.

Note that scraping many seasons can be slow!
'''
data_folder = output_path

if output_path[-1] == '/':
    raise ValueError("Output path must not end with a trailing slash; please remove it")


season_start = start_year
season_end = end_year
current_season = season_start

url_base = 'https://www.pro-football-reference.com'
game_data = []

while current_season <= season_end:
time.sleep((1.5 + random.random() * 2))
url = '{0}/years/{1}/week_1.htm'.format(url_base,current_season)
print('Requesting weeks for the {0} season...'.format(current_season))
raw = requests.get(url)
parsed = BeautifulSoup(raw.content, 'html.parser')
all_anchors = parsed.find_all('a',href=True) ## anchors used b/c commenting makes pulling specific divs hard ##
week_links = []
for a in all_anchors:
if '/years/{0}/week_'.format(current_season) in a.get('href'):
week_info = {
'Week Name' : None,
'Week Link' : None,
}
box_info['Season'] = int(current_season)
box_info['Week'] = week['Week Name']
box_info['Week Number'] = int(week.split('/week_')[1].split('.htm')[0])
box_info['Box Score Link'] = '{0}{1}'.format(url_base,str(a.get('href')))
game_data.append(box_info)
current_season += 1

df = pd.DataFrame(game_data)
df = df[['Season', 'Week', 'Week Number', 'Box Score Link']]
df.to_csv('{0}/game_links_{1}_to_{2}.csv'.format(data_folder,season_start,season_end))
week_info['Week Name'] = str(a.text)
week_info['Week Link'] = '{0}{1}'.format(url_base,str(a.get('href')))
week_links.append(week_info)
## remove duplicates from week_links ##:
## from https://stackoverflow.com/questions/9427163/remove-duplicate-dict-in-list-in-python ##
seen_links = []
new_link_list = []
for d in week_links:
t = d['Week Link']
if t not in seen_links:
seen_links.append(t)
new_link_list.append(d)
week_links = new_link_list
print(' * Found {0} weeks...'.format(len(week_links)))
for week in week_links:
print(' * Pulling {0} game links'.format(week['Week Name']))
time.sleep((.75 + random.random() * 1.5))
url = week['Week Link']
raw_week = requests.get(url)
parsed_week = BeautifulSoup(raw_week.content, 'html.parser')
week_anchors = parsed_week.find_all('a',href=True)
for a in week_anchors:
if '/boxscores/{0}'.format(current_season) in a.get('href') or '/boxscores/{0}'.format(current_season + 1) in a.get('href'):
box_info = {
'Season' : None,
'Week' : None,
'Week Number' : None,
'Box Score Link' : None,
}
box_info['Season'] = int(current_season)
box_info['Week'] = week['Week Name']
# box_info['Week Number'] = int(week['Week Link'].split('/week_')[1].split('.htm')[0])
# I (Dennis) was unable to get this line to successfully convert Weeks to Week
# numbers. If you can manage to do so, feel free to un-comment it, and then
# ditch the "fix_weeks" function later in the pipeline.
box_info['Box Score Link'] = '{0}{1}'.format(url_base,str(a.get('href')))
game_data.append(box_info)
current_season += 1

df = pd.DataFrame(game_data)
df = df[['Season', 'Week', 'Week Number', 'Box Score Link']]
df.to_csv('{0}/game_links_{1}_to_{2}.csv'.format(data_folder,season_start,season_end))

return

def main():
print('Script was run directly, but this doesn\'t do anything!')

if __name__ == '__main__': main()