3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
__pycache__/
*.py[cod]
*$py.class
34 changes: 34 additions & 0 deletions README.md
@@ -1,5 +1,39 @@
# pfr_metadata_pull

All of your favorite pfr_metadata_pull code, now in package form! Usage is as simple as:

1. Clone this repo
2. Make sure your local copy of the repo lives in a directory that's in your PYTHONPATH
3. Open up python in your manner of choice and type the following:
```python
import pfr_metadata_pull as meta

meta.scrape_links(start_year, end_year, output_path) # creates a file "game_links_startyear_to_endyear.csv" in the 'output_path' directory
meta.pull_data_from_links("game_links_startyear_to_endyear.csv", output_path) # creates a file "game_meta_data.csv" in the 'output_path' directory
meta.fix_weeks("game_meta_data.csv", output_path) # creates a file "game_meta_data_weeks_fixed.csv" in the 'output_path' directory
meta.format_data("game_meta_data_weeks_fixed.csv", output_path) # creates two files in the 'output_path' directory
```
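For a concrete run, here's the same pipeline over the 2015-2019 seasons (the years and output path below are just examples; per the `scrape_links` docstring, the start year can be as early as 1960, and the path must have no trailing slash):
```python
import pfr_metadata_pull as meta

output_path = '/path/to/data'  # example location; no trailing slash

meta.scrape_links(2015, 2019, output_path)                              # writes game_links_2015_to_2019.csv
meta.pull_data_from_links('game_links_2015_to_2019.csv', output_path)   # writes game_meta_data.csv
meta.fix_weeks('game_meta_data.csv', output_path)                       # writes game_meta_data_weeks_fixed.csv
meta.format_data('game_meta_data_weeks_fixed.csv', output_path)         # writes the two files described below
```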
The final `format_data` function makes two files. One, "game_meta_data_formatted.csv", is a nice, readable version of the metadata.
The other, __"game_meta_data_ready_to_merge.csv"__, is what you'll need to add metadata to an existing play-by-play file.
Say you have a file "pbp.csv" that spans some range of seasons, and you've just created "game_meta_data_ready_to_merge.csv" for that same range of seasons. Now you can do:
```python
import pandas as pd
pbp = pd.read_csv('pbp.csv')
meta = pd.read_csv('game_meta_data_ready_to_merge.csv')
pbp_meta = pd.merge(pbp, meta, on=['season','week','home_team','away_team'], how='left')
```
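If you want to check that every play actually found its game, the `indicator` flag on `pd.merge` gives a quick sanity check; a minimal sketch of the same merge as above:
```python
import pandas as pd

pbp = pd.read_csv('pbp.csv')
meta = pd.read_csv('game_meta_data_ready_to_merge.csv')

# indicator=True adds a '_merge' column; 'left_only' rows found no metadata
pbp_meta = pd.merge(pbp, meta, on=['season', 'week', 'home_team', 'away_team'],
                    how='left', indicator=True)
print(pbp_meta['_merge'].value_counts())
pbp_meta = pbp_meta.drop(columns='_merge')
```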
Or maybe you do this part in R:
```R
library(tidyverse)
pbp <- read_csv('pbp.csv')
meta <- read_csv('game_meta_data_ready_to_merge.csv')
pbp_meta <- left_join(pbp, meta, by = c("season", "week", "home_team", "away_team"))
```

These changes were made by [Dennis Brookner](https://github.com/dennisbrookner); direct concerns to me, or to [Puntalytics](https://twitter.com/ThePuntRunts) on Twitter.

### Original README from greerre

This repo contains the set of scripts used to create the dataset referenced here:

https://twitter.com/greerreNFL/status/1146519422527389696
12 changes: 12 additions & 0 deletions __init__.py
@@ -0,0 +1,12 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 26 16:25:40 2020

@author: dennisbrookner
"""

from .pfr_game_link_scraper import scrape_links
from .pfr_meta_data_pull import pull_data_from_links
from .week_name_stopgap import fix_weeks
from .pfr_meta_data_format import format_data
145 changes: 85 additions & 60 deletions pfr_game_link_scraper.py
@@ -12,65 +12,90 @@
import pandas as pd
import numpy

data_folder = 'file path to folder where all data will be held...no trailing slash'

season_start = 1960
season_end = 2018
current_season = season_start

url_base = 'https://www.pro-football-reference.com'
game_data = []

while current_season <= season_end:
time.sleep((1.5 + random.random() * 2))
url = '{0}/years/{1}/week_1.htm'.format(url_base,current_season)
print('Requesting weeks for the {0} season...'.format(current_season))
raw = requests.get(url)
parsed = BeautifulSoup(raw.content, 'html.parser')
all_anchors = parsed.find_all('a',href=True) ## anchors used b/c commenting makes pulling specific divs hard ##
week_links = []
for a in all_anchors:
if '/years/{0}/week_'.format(current_season) in a.get('href'):
week_info = {
'Week Name' : None,
'Week Link' : None,
}
week_info['Week Name'] = str(a.text)
week_info['Week Link'] = '{0}{1}'.format(url_base,str(a.get('href')))
week_links.append(week_info)
## remove duplicates from week_links ##:
## from https://stackoverflow.com/questions/9427163/remove-duplicate-dict-in-list-in-python ##
seen_links = []
new_link_list = []
for d in week_links:
t = d['Week Link']
if t not in seen_links:
seen_links.append(t)
new_link_list.append(d)
week_links = new_link_list
print(' * Found {0} weeks...'.format(len(week_links)))
for week in week_links:
print(' * Pulling {0} game links'.format(week['Week Name']))
time.sleep((.75 + random.random() * 1.5))
url = week['Week Link']
raw_week = requests.get(url)
parsed_week = BeautifulSoup(raw_week.content, 'html.parser')
week_anchors = parsed_week.find_all('a',href=True)
for a in week_anchors:
if '/boxscores/{0}'.format(current_season) in a.get('href') or '/boxscores/{0}'.format(current_season + 1) in a.get('href'):
box_info = {
'Season' : None,
'Week' : None,
'Week Number' : None,
'Box Score Link' : None,
def scrape_links(start_year, end_year, output_path):
'''
Start year can be as early as 1960; end year can be as late as the current year.

Output path should point to the desired location of the metadata, and should NOT have a trailing slash.

Note that scraping many seasons can be slow!
'''
data_folder = output_path

if output_path[-1] == '/':
    raise ValueError("Output path must not end with a trailing slash; please remove it")


season_start = start_year
season_end = end_year
current_season = season_start

url_base = 'https://www.pro-football-reference.com'
game_data = []

while current_season <= season_end:
time.sleep((1.5 + random.random() * 2))
url = '{0}/years/{1}/week_1.htm'.format(url_base,current_season)
print('Requesting weeks for the {0} season...'.format(current_season))
raw = requests.get(url)
parsed = BeautifulSoup(raw.content, 'html.parser')
all_anchors = parsed.find_all('a',href=True) ## anchors used b/c commenting makes pulling specific divs hard ##
week_links = []
for a in all_anchors:
if '/years/{0}/week_'.format(current_season) in a.get('href'):
week_info = {
'Week Name' : None,
'Week Link' : None,
}
box_info['Season'] = int(current_season)
box_info['Week'] = week['Week Name']
box_info['Week Number'] = int(week.split('/week_')[1].split('.htm')[0])
box_info['Box Score Link'] = '{0}{1}'.format(url_base,str(a.get('href')))
game_data.append(box_info)
current_season += 1

df = pd.DataFrame(game_data)
df = df[['Season', 'Week', 'Week Number', 'Box Score Link']]
df.to_csv('{0}/game_links_{1}_to_{2}.csv'.format(data_folder,season_start,season_end))
week_info['Week Name'] = str(a.text)
week_info['Week Link'] = '{0}{1}'.format(url_base,str(a.get('href')))
week_links.append(week_info)
## remove duplicates from week_links ##:
## from https://stackoverflow.com/questions/9427163/remove-duplicate-dict-in-list-in-python ##
seen_links = []
new_link_list = []
for d in week_links:
t = d['Week Link']
if t not in seen_links:
seen_links.append(t)
new_link_list.append(d)
week_links = new_link_list
print(' * Found {0} weeks...'.format(len(week_links)))
for week in week_links:
print(' * Pulling {0} game links'.format(week['Week Name']))
time.sleep((.75 + random.random() * 1.5))
url = week['Week Link']
raw_week = requests.get(url)
parsed_week = BeautifulSoup(raw_week.content, 'html.parser')
week_anchors = parsed_week.find_all('a',href=True)
for a in week_anchors:
if '/boxscores/{0}'.format(current_season) in a.get('href') or '/boxscores/{0}'.format(current_season + 1) in a.get('href'):
box_info = {
'Season' : None,
'Week' : None,
'Week Number' : None,
'Box Score Link' : None,
}
box_info['Season'] = int(current_season)
box_info['Week'] = week['Week Name']
# box_info['Week Number'] = int(week['Week Link'].split('/week_')[1].split('.htm')[0])
# I (Dennis) was unable to get this line to successfully convert Weeks to Week
# numbers. If you can manage to do so, feel free to un-comment it, and then
# ditch the "fix_weeks" function later in the pipeline.
box_info['Box Score Link'] = '{0}{1}'.format(url_base,str(a.get('href')))
game_data.append(box_info)
current_season += 1

df = pd.DataFrame(game_data)
df = df[['Season', 'Week', 'Week Number', 'Box Score Link']]
df.to_csv('{0}/game_links_{1}_to_{2}.csv'.format(data_folder,season_start,season_end))

return

def main():
print('Script was run directly, but this doesn\'t do anything!')

if __name__ == '__main__': main()