diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6e4266f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+*.py[cod]
+*$py.class
diff --git a/README.md b/README.md
index c503451..b101818 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,39 @@
 # pfr_metadata_pull
+All of your favorite pfr_metadata_pull code, now in package form! Usage is as simple as:
+
+1. Clone this repo
+2. Make sure your local copy of the repo lives in a directory that's in your PYTHONPATH (see the note below if this step gives you trouble)
+3. Open up Python in your manner of choice and type the following:
+```python
+import pfr_metadata_pull as meta
+
+meta.scrape_links(start_year, end_year, output_path) # creates a file "game_links_startyear_to_endyear.csv" in the 'output_path' directory
+meta.pull_data_from_links("game_links_startyear_to_endyear.csv", output_path) # creates a file "game_meta_data.csv" in the 'output_path' directory
+meta.fix_weeks("game_meta_data.csv", output_path) # creates a file "game_meta_data_weeks_fixed.csv" in the 'output_path' directory
+meta.format_data("game_meta_data_weeks_fixed.csv", output_path) # creates two files in the 'output_path' directory
+```
+The final format_data function makes two files - one, "game_meta_data_formatted.csv", is a nice, pretty version of the metadata.
+The other file, __"game_meta_data_ready_to_merge.csv"__, is what you'll need to add metadata to an existing play-by-play file.
+Say you have a file "pbp.csv" that spans some range of seasons, and you just created "game_meta_data_ready_to_merge.csv" for that same range of seasons. Now you can do:
+```python
+import pandas as pd
+pbp = pd.read_csv('pbp.csv')
+meta = pd.read_csv('game_meta_data_ready_to_merge.csv')
+pbp_meta = pd.merge(pbp, meta, on=['season','week','home_team','away_team'], how='left')
+```
+Or maybe you do this part in R:
+```R
+library(tidyverse)
+pbp <- read_csv('pbp.csv')
+meta <- read_csv('game_meta_data_ready_to_merge.csv')
+pbp_meta <- left_join(pbp, meta, by = c("season", "week", "home_team", "away_team"))
+```
+
+These changes were made by [Dennis Brookner](https://github.com/dennisbrookner); direct concerns to me, or to [Puntalytics](https://twitter.com/ThePuntRunts) on Twitter.
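+
+### A note on PYTHONPATH
+If step 2 above gives you trouble, here's one quick way (of several) to check your path and, if needed, extend it for the current session. The directory below is a placeholder - use the folder that *contains* your clone of this repo:
+```python
+import sys
+print(sys.path)  # the folder containing pfr_metadata_pull must appear in this list
+sys.path.append('/path/to/folder/containing/pfr_metadata_pull')  # placeholder path
+import pfr_metadata_pull as meta  # should now succeed
+```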
+ +### Original README from greerre + This repo contains the set of scripts used to create the dataset referenced here: https://twitter.com/greerreNFL/status/1146519422527389696 diff --git a/__init__.py b/__init__.py new file mode 100755 index 0000000..3ac1df3 --- /dev/null +++ b/__init__.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue May 26 16:25:40 2020 + +@author: dennisbrookner +""" + +from .pfr_game_link_scraper import scrape_links +from .pfr_meta_data_pull import pull_data_from_links +from .week_name_stopgap import fix_weeks +from .pfr_meta_data_format import format_data \ No newline at end of file diff --git a/pfr_game_link_scraper.py b/pfr_game_link_scraper.py index d7bf50e..eac41f4 100644 --- a/pfr_game_link_scraper.py +++ b/pfr_game_link_scraper.py @@ -12,65 +12,90 @@ import pandas as pd import numpy -data_folder = 'file path to folder where all data will be held...no trailing slash' -season_start = 1960 -season_end = 2018 -current_season = season_start - -url_base = 'https://www.pro-football-reference.com' -game_data = [] - -while current_season <= season_end: - time.sleep((1.5 + random.random() * 2)) - url = '{0}/years/{1}/week_1.htm'.format(url_base,current_season) - print('Requesting weeks for the {0} season...'.format(current_season)) - raw = requests.get(url) - parsed = BeautifulSoup(raw.content, 'html.parser') - all_anchors = parsed.find_all('a',href=True) ## anchors used b/c commenting makes pulling specific divs hard ## - week_links = [] - for a in all_anchors: - if '/years/{0}/week_'.format(current_season) in a.get('href'): - week_info = { - 'Week Name' : None, - 'Week Link' : None, - } - week_info['Week Name'] = str(a.text) - week_info['Week Link'] = '{0}{1}'.format(url_base,str(a.get('href'))) - week_links.append(week_info) - ## remove duplicates from week_links ##: - ## from https://stackoverflow.com/questions/9427163/remove-duplicate-dict-in-list-in-python ## - seen_links = [] - new_link_list = [] - for d in week_links: - t = d['Week Link'] - if t not in seen_links: - seen_links.append(t) - new_link_list.append(d) - week_links = new_link_list - print(' * Found {0} weeks...'.format(len(week_links))) - for week in week_links: - print(' * Pulling {0} game links'.format(week['Week Name'])) - time.sleep((.75 + random.random() * 1.5)) - url = week['Week Link'] - raw_week = requests.get(url) - parsed_week = BeautifulSoup(raw_week.content, 'html.parser') - week_anchors = parsed_week.find_all('a',href=True) - for a in week_anchors: - if '/boxscores/{0}'.format(current_season) in a.get('href') or '/boxscores/{0}'.format(current_season + 1) in a.get('href'): - box_info = { - 'Season' : None, - 'Week' : None, - 'Week Number' : None, - 'Box Score Link' : None, +def scrape_links(start_year, end_year, output_path): + ''' + Start year can be as early as 1960; end year can be as late as the current year. + + Output path should point to the desired location of the metadata, and should NOT have a trailing slash. + + Note that for many seasons, this can be slow! 
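+
+    For example (with an illustrative path):
+        scrape_links(2015, 2018, '/path/to/data')
+    will write 'game_links_2015_to_2018.csv' into that directory.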
+ ''' + data_folder = output_path + + if output_path[-1] == '/': + raise ValueError("Indicated file path includes a trailing slash, please remove it") + + + season_start = start_year + season_end = end_year + current_season = season_start + + url_base = 'https://www.pro-football-reference.com' + game_data = [] + + while current_season <= season_end: + time.sleep((1.5 + random.random() * 2)) + url = '{0}/years/{1}/week_1.htm'.format(url_base,current_season) + print('Requesting weeks for the {0} season...'.format(current_season)) + raw = requests.get(url) + parsed = BeautifulSoup(raw.content, 'html.parser') + all_anchors = parsed.find_all('a',href=True) ## anchors used b/c commenting makes pulling specific divs hard ## + week_links = [] + for a in all_anchors: + if '/years/{0}/week_'.format(current_season) in a.get('href'): + week_info = { + 'Week Name' : None, + 'Week Link' : None, } - box_info['Season'] = int(current_season) - box_info['Week'] = week['Week Name'] - box_info['Week Number'] = int(week.split('/week_')[1].split('.htm')[0]) - box_info['Box Score Link'] = '{0}{1}'.format(url_base,str(a.get('href'))) - game_data.append(box_info) - current_season += 1 - -df = pd.DataFrame(game_data) -df = df[['Season', 'Week', 'Week Number', 'Box Score Link']] -df.to_csv('{0}/game_links_{1}_to_{2}.csv'.format(data_folder,season_start,season_end)) + week_info['Week Name'] = str(a.text) + week_info['Week Link'] = '{0}{1}'.format(url_base,str(a.get('href'))) + week_links.append(week_info) + ## remove duplicates from week_links ##: + ## from https://stackoverflow.com/questions/9427163/remove-duplicate-dict-in-list-in-python ## + seen_links = [] + new_link_list = [] + for d in week_links: + t = d['Week Link'] + if t not in seen_links: + seen_links.append(t) + new_link_list.append(d) + week_links = new_link_list + print(' * Found {0} weeks...'.format(len(week_links))) + for week in week_links: + print(' * Pulling {0} game links'.format(week['Week Name'])) + time.sleep((.75 + random.random() * 1.5)) + url = week['Week Link'] + raw_week = requests.get(url) + parsed_week = BeautifulSoup(raw_week.content, 'html.parser') + week_anchors = parsed_week.find_all('a',href=True) + for a in week_anchors: + if '/boxscores/{0}'.format(current_season) in a.get('href') or '/boxscores/{0}'.format(current_season + 1) in a.get('href'): + box_info = { + 'Season' : None, + 'Week' : None, + 'Week Number' : None, + 'Box Score Link' : None, + } + box_info['Season'] = int(current_season) + box_info['Week'] = week['Week Name'] + #box_info['Week Number'] = int(week.split('/week_')[1].split('.htm')[0]) + ''' + I (Dennis) was unable to get this line to successfully convert Weeks to Week numbers. 
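+            The likely culprit: at this point in the loop, 'week' is a dict, so the split
+            would need to run on week['Week Link'] rather than on week itself.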
+ If you can manage to do so, feel free to un-comment this line, and then ditch the + "fix weeks" function later in the pipeline + ''' + box_info['Box Score Link'] = '{0}{1}'.format(url_base,str(a.get('href'))) + game_data.append(box_info) + current_season += 1 + + df = pd.DataFrame(game_data) + df = df[['Season', 'Week', 'Week Number', 'Box Score Link']] + df.to_csv('{0}/game_links_{1}_to_{2}.csv'.format(data_folder,season_start,season_end)) + + return + +def main(): + print('Script was run directly, but this doesn\'t do anything!') + +if __name__ == '__main__': main() diff --git a/pfr_meta_data_format.py b/pfr_meta_data_format.py index 8952c1c..3d440d5 100644 --- a/pfr_meta_data_format.py +++ b/pfr_meta_data_format.py @@ -12,11 +12,10 @@ import numpy import math -data_folder = 'file path to folder where all data will be held...no trailing slash' -df_raw = pd.read_csv('{0}/game_meta_data.csv') -df_divisions = pd.read_csv(' file path to divisions.csv') ## this csv is uploaded to the github -df_scraper_game = pd.read_csv(' file path reg_game_all.csv') ## this csv is uploaded to the github +# Define all global variables first +df_divisions = pd.read_csv('https://raw.githubusercontent.com/greerre/pfr_metadata_pull/master/divisions.csv') ## this csv is uploaded to the github; possible this link breaks at some point +df_scraper_game = pd.read_csv('https://raw.githubusercontent.com/greerre/pfr_metadata_pull/master/reg_game_all.csv') ## this csv is uploaded to the github; possible this link breaks at some point pfr_to_pbp_dict = { @@ -61,15 +60,6 @@ } -df_format = df_raw -df_format['Home Team (pfr)'] = df_format['Home Team'] -df_format['Away Team (pfr)'] = df_format['Away Team'] -df_format['Home Team'] = df_format['Home Team'].replace(pfr_to_pbp_dict) -df_format['Away Team'] = df_format['Away Team'].replace(pfr_to_pbp_dict) - -df_divisions_home = df_divisions -df_divisions_away = df_divisions - home_rename_dict = { 'Teams' : 'Home Team', 'Conference' : 'Home Conference', @@ -81,112 +71,6 @@ 'Division' : 'Away Division', } -df_divisions_home = df_divisions_home.rename(columns=home_rename_dict) -df_divisions_away = df_divisions_away.rename(columns=away_rename_dict) - -df_format = pd.merge(df_format,df_divisions_home,on=['Home Team'], how='left') -df_format = pd.merge(df_format,df_divisions_away,on=['Away Team'], how='left') -df_format = df_format.drop(columns=['Unnamed: 0', 'Unnamed: 0_y', 'Unnamed: 0_x']) - -df_format['Divisional Game'] = numpy.where((df_format['Season'] >= 2002) & (df_format['Home Conference'] == df_format['Away Conference']) & (df_format['Home Division'] == df_format['Away Division']),1,0) - - -def url_to_id(url_id): - id = numpy.nan - try: - id = url_id.split('/')[-1].split('.htm')[0] - except: - pass - return id - - -def row_format(row): - ## pull out degrees and wind ## - row['Temperature'] = None - row['Wind'] = None - if row['Roof'] != 'outdoors': - row['Temperature'] = 70 - row['Wind'] = 0 - else: - try: - row['Temperature'] = int(row['Weather'].split(' degrees')[0]) - except: - pass - try: - if 'no wind' in row['Weather']: - row['Wind'] = 0 - else: - row['Wind'] = int(row['Weather'].split(',')[1].split('wind ')[1].split(' mph')[0]) - except: - pass - ## translate vegas line to home line ## - row['Home Spread'] = numpy.nan - row['Total'] = numpy.nan - if row['Vegas Line'] == 'Pick': - row['Home Spread'] = 0 - else: - line_list = row['Vegas Line'].split(' -') - fav = line_list[0] - favored_by = float(line_list[1]) - if fav == row['Home Team (pfr)']: - row['Home Spread'] = 
favored_by - elif fav == row['Away Team (pfr)']: - row['Home Spread'] = favored_by * -1.0 - else: - row['Home Spread'] = numpy.nan - try: - row['Total'] = float(row['Over/Under'].split(' (')[0]) - except: - pass - ## translate attendance ## - try: - row['Attendance'] = int(row['Attendance'].replace(',','')) - except: - try: - row['Attendance'] = int(row['Attendance']) - except: - row['Attendance'] = numpy.nan - ## translate tosses ## - row['Home Won Toss'] = numpy.nan - row['Deferred'] = numpy.nan - ## for some reason the toss text is read as a float if it's blank and will throw - ## an error on the split. This is handled w/ the try / except - try: - if row['Won Toss'] == numpy.nan: - pass - else: - home_mascot = row['Home Team (pfr)'].split(' ')[-1] - away_mascot = row['Away Team (pfr)'].split(' ')[-1] - winner = row['Won Toss'].split(' (')[0] - if home_mascot == winner: - row['Home Won Toss'] = 1 - if len(row['Won Toss'].split(' (')) > 1: - row['Deferred'] = 1 - else: - row['Deferred'] = 0 - elif away_mascot == winner: - row['Home Won Toss'] = 0 - if len(row['Won Toss'].split(' (')) > 1: - row['Deferred'] = 1 - else: - row['Deferred'] = 0 - else: - pass - except: - pass - ## conevrt urls to ids ## - row['Stadium ID'] = url_to_id(row['Stadium Link']) - row['Home Coach ID'] = url_to_id(row['Home Coach Link']) - row['Away Coach ID'] = url_to_id(row['Away Coach Link']) - row['Home Starting QB ID'] = url_to_id(row['Home Starting QB Link']) - row['Away Starting QB ID'] = url_to_id(row['Away Starting QB Link']) - return row - - - -df_new = df_format.apply(row_format,axis=1) -df_new.to_csv('/Users/robertgreer/Documents/Coding/NFL/pro-football-reference/Data Files/game_meta_data_formatted.csv') - meta_merge_headers = [ 'Season', @@ -229,13 +113,6 @@ def row_format(row): ] -merge_df = df_new[meta_merge_headers] - -## convert header formating to match nflscrapR for the join -## note of caution...the original scraper swapped home and away team name and coaches -## those were swapped back with the header rename dict below -## the scraper has been fixed and the dict below has been swapped back, but neither tested - rename_merge_headers = { 'Season' : 'season', @@ -278,9 +155,6 @@ def row_format(row): } -merge_df = merge_df.rename(columns=rename_merge_headers) - -## prep scrapeR df ## pbp_team_standard_dict = { 'ARI' : 'ARI', @@ -321,14 +195,6 @@ def row_format(row): } - -## standardize team names across data sets ## -df_scraper_game['home_team'] = df_scraper_game['home_team'].replace(pbp_team_standard_dict) -df_scraper_game['away_team'] = df_scraper_game['away_team'].replace(pbp_team_standard_dict) - -## create new_df ## -merged_df = pd.merge(merge_df,df_scraper_game,on=['season','week','home_team','away_team'],how='left') - final_headers = [ 'type', @@ -365,7 +231,7 @@ def row_format(row): 'home_starting_qb', 'away_starting_qb_id', 'home_starting_qb_id', - 'away_won_toss', + #'away_won_toss', # This threw an error for me (Dennis) so I removed it; sorry if you wanted this! 
'winner_deferred', 'referee', 'umpire', @@ -377,5 +243,162 @@ def row_format(row): ] -merged_df = merged_df[final_headers] -merged_df.to_csv('{0}/reg_game_w_meta.csv'.format(data_folder)) +# then helper functions: +def url_to_id(url_id): + id = numpy.nan + try: + id = url_id.split('/')[-1].split('.htm')[0] + except: + pass + return id + + +def row_format(row): + ## pull out degrees and wind ## + row['Temperature'] = None + row['Wind'] = None + if row['Roof'] != 'outdoors': + row['Temperature'] = 70 + row['Wind'] = 0 + else: + try: + row['Temperature'] = int(row['Weather'].split(' degrees')[0]) + except: + pass + try: + if 'no wind' in row['Weather']: + row['Wind'] = 0 + else: + row['Wind'] = int(row['Weather'].split(',')[1].split('wind ')[1].split(' mph')[0]) + except: + pass + ## translate vegas line to home line ## + row['Home Spread'] = numpy.nan + row['Total'] = numpy.nan + if row['Vegas Line'] == 'Pick': + row['Home Spread'] = 0 + else: + line_list = row['Vegas Line'].split(' -') + fav = line_list[0] + favored_by = float(line_list[1]) + if fav == row['Home Team (pfr)']: + row['Home Spread'] = favored_by + elif fav == row['Away Team (pfr)']: + row['Home Spread'] = favored_by * -1.0 + else: + row['Home Spread'] = numpy.nan + try: + row['Total'] = float(row['Over/Under'].split(' (')[0]) + except: + pass + ## translate attendance ## + try: + row['Attendance'] = int(row['Attendance'].replace(',','')) + except: + try: + row['Attendance'] = int(row['Attendance']) + except: + row['Attendance'] = numpy.nan + ## translate tosses ## + row['Home Won Toss'] = numpy.nan + row['Deferred'] = numpy.nan + ## for some reason the toss text is read as a float if it's blank and will throw + ## an error on the split. This is handled w/ the try / except + try: + if row['Won Toss'] == numpy.nan: + pass + else: + home_mascot = row['Home Team (pfr)'].split(' ')[-1] + away_mascot = row['Away Team (pfr)'].split(' ')[-1] + winner = row['Won Toss'].split(' (')[0] + if home_mascot == winner: + row['Home Won Toss'] = 1 + if len(row['Won Toss'].split(' (')) > 1: + row['Deferred'] = 1 + else: + row['Deferred'] = 0 + elif away_mascot == winner: + row['Home Won Toss'] = 0 + if len(row['Won Toss'].split(' (')) > 1: + row['Deferred'] = 1 + else: + row['Deferred'] = 0 + else: + pass + except: + pass + ## conevrt urls to ids ## + row['Stadium ID'] = url_to_id(row['Stadium Link']) + row['Home Coach ID'] = url_to_id(row['Home Coach Link']) + row['Away Coach ID'] = url_to_id(row['Away Coach Link']) + row['Home Starting QB ID'] = url_to_id(row['Home Starting QB Link']) + row['Away Starting QB ID'] = url_to_id(row['Away Starting QB Link']) + return row + +# Now the real, for-use function: +def format_data(input_file, output_path): + ''' + Input file should point to the csv created by week_name_stopgap + + Output path should point to the desired location of the metadata, and should NOT have a trailing slash. 
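+
+    The two files written to the output directory are 'game_meta_data_formatted.csv'
+    and 'game_meta_data_ready_to_merge.csv'.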
+ ''' + + data_folder = output_path + + df_raw = pd.read_csv(input_file) + + df_format = df_raw + df_format['Home Team (pfr)'] = df_format['Home Team'] + df_format['Away Team (pfr)'] = df_format['Away Team'] + df_format['Home Team'] = df_format['Home Team'].replace(pfr_to_pbp_dict) + df_format['Away Team'] = df_format['Away Team'].replace(pfr_to_pbp_dict) + + df_divisions_home = df_divisions + df_divisions_away = df_divisions + + + df_divisions_home = df_divisions_home.rename(columns=home_rename_dict) + df_divisions_away = df_divisions_away.rename(columns=away_rename_dict) + + df_format = pd.merge(df_format,df_divisions_home,on=['Home Team'], how='left') + df_format = pd.merge(df_format,df_divisions_away,on=['Away Team'], how='left') + df_format = df_format.drop(columns=['Unnamed: 0', 'Unnamed: 0_y', 'Unnamed: 0_x']) + + df_format['Divisional Game'] = numpy.where((df_format['Season'] >= 2002) & (df_format['Home Conference'] == df_format['Away Conference']) & (df_format['Home Division'] == df_format['Away Division']),1,0) + + + df_new = df_format.apply(row_format,axis=1) + df_new.to_csv(f'{data_folder}/game_meta_data_formatted.csv') + + + merge_df = df_new[meta_merge_headers] + + ## convert header formating to match nflscrapR for the join + ## note of caution...the original scraper swapped home and away team name and coaches + ## those were swapped back with the header rename dict below + ## the scraper has been fixed and the dict below has been swapped back, but neither tested + + + merge_df = merge_df.rename(columns=rename_merge_headers) + merge_df.to_csv(f'{data_folder}/game_meta_data_ready_to_merge.csv') + ## prep scrapeR df ## + + + # ## standardize team names across data sets ## + # df_scraper_game['home_team'] = df_scraper_game['home_team'].replace(pbp_team_standard_dict) + # df_scraper_game['away_team'] = df_scraper_game['away_team'].replace(pbp_team_standard_dict) + + # ## create new_df ## + # merged_df = pd.merge(merge_df,df_scraper_game,on=['season','week','home_team','away_team'],how='left') + + + + # merged_df = merged_df[final_headers] + # merged_df.to_csv('{0}/reg_game_w_meta.csv'.format(data_folder)) + + return + +def main(): + print('Script was run directly, but this doesn\'t do anything!') + +if __name__ == '__main__': main() diff --git a/pfr_meta_data_pull.py b/pfr_meta_data_pull.py index 8733cae..a4328ab 100644 --- a/pfr_meta_data_pull.py +++ b/pfr_meta_data_pull.py @@ -8,13 +8,6 @@ import pandas as pd import numpy -data_folder = 'file path to folder where all data will be held...no trailing slash' - -## Pull in URLs by turning data fram into list ## -url_file = '{0}/game_links_1960_to_2018.csv'.format(data_folder) -url_df = pd.read_csv(url_file) -filtered_df = url_df[url_df['Season'] >= 1990] ## hasn't been tested before 1990, but would work in theory ## -urls = filtered_df['Box Score Link'].tolist() ## helper data structures ## @@ -26,7 +19,6 @@ 'May' : 5, 'Jun' : 6, 'Jul' : 7, - 'Jul' : 8, 'Aug' : 8, 'Sep' : 9, 'Oct' : 10, @@ -157,198 +149,228 @@ def get_officials_info(officials_div): field_judge = official_name return referee, umpire, down_judge, line_judge, back_judge, side_judge, field_judge +def pull_data_from_links(input_file, output_path, cutoff_year = 1990): + ''' + url file should be that produced by the scrape_links function. + + Output path should point to the desired location of the metadata, and should NOT have a trailing slash. 
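+
+    The scraped results are written to 'game_meta_data.csv' in the output directory.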
-game_data_rows = []
-broken_box_list = []
+    The default cutoff year ensures that if the input url file contains data
+    from seasons prior to 1990, those will be omitted here due to fears of
+    incompatibility; feel free to change this parameter at your own risk.
+
+    Note: This function might take a little while to run. It takes ~2 seconds
+    per game, with ~250 games/season
+    '''
+
+    game_data_rows = []
+    broken_box_list = []
+
+    data_folder = output_path

-for url in urls:
-    time.sleep((.75 + random.random() * .5))
-    try:
-        game_data_points = {
-            'Game Link' : None,
-            'Game Date' : None,
-            'Game Day' : None,
-            'Local Start Time' : None,
-            'Game Length' : None,
-            'Stadium' : None,
-            'Stadium Link' : None,
-            'Attendance' : None,
-            'Season': None,
-            'Week' : None,
-            'Home Team' : None,
-            'Away Team' : None,
-            'Home Record' : None,
-            'Away Record' : None,
-            'Home Score' : None,
-            'Away Score' : None,
-            'Home Coach' : None,
-            'Away Coach' : None,
-            'Home Coach Link' : None,
-            'Away Coach Link' : None,
-            'Home Starting QB' : None,
-            'Away Starting QB' : None,
-            'Home Starting QB Link' : None,
-            'Away Starting QB Link' : None,
-            'Won Toss' : None,
-            'Won Toss (OT)' : None,
-            'Roof' : None,
-            'Surface' : None,
-            'Weather' : None,
-            'Vegas Line' : None,
-            'Over/Under' : None,
-            'Referee' : None,
-            'Umpire' : None,
-            'Head Linesman / Down Judge' : None,
-            'Line Judge' : None,
-            'Back Judge' : None,
-            'Side Judge' : None,
-            'Field Judge' : None,
-        }
-        raw = requests.get(url)
-        parsed = BeautifulSoup(raw.content, 'html.parser')
-        score_board_divs = parsed.find('div', {'class' : 'scorebox'}).find_all('div', recursive=False)
-        home_div = score_board_divs[0]
-        away_div = score_board_divs[1]
-        meta_div = score_board_divs[2]
-        away_div_divs = away_div.find_all('div', recursive=False)
-        away_team = away_div_divs[0].find('a', {'itemprop' : 'name'}).text
-        try:
-            away_score = int(away_div_divs[1].find('div').text)
-        except:
-            away_score = int(away_div_divs[1].text)
-        away_record = away_div_divs[2].text
-        away_coach = away_div_divs[4].find('a').text
-        away_coach_link = away_div_divs[4].find('a').get('href')
-        home_div_divs = home_div.find_all('div', recursive=False)
-        home_team = home_div_divs[0].find('a', {'itemprop' : 'name'}).text
-        try:
-            home_score = int(home_div_divs[1].find('div').text)
-        except:
-            home_score = int(home_div_divs[1].text)
-        home_record = home_div_divs[2].text
-        home_coach = home_div_divs[4].find('a').text
-        home_coach_link = home_div_divs[4].find('a').get('href')
-        try: ## pfr's commenting messes up bs4s parsing, so the specific part has to get pulled as text and re-parsed ##
-            game_info_div_effed = str(parsed.find('div', {'id': 'all_game_info'}))
-            game_info_div = BeautifulSoup(game_info_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
-        except:
-            game_info_div = None
-        try:
-            home_starter_div_effed = str(parsed.find('div', {'id' : 'all_home_starters'}))
-            home_starter_div = BeautifulSoup(home_starter_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
-        except:
-            home_starter_div = None
-        try:
-            away_starter_div_effed = str(parsed.find('div', {'id' : 'all_vis_starters'}))
-            away_starter_div = BeautifulSoup(away_starter_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
-        except:
-            away_starter_div = None
+    ## Pull in URLs by turning data frame into list ##
+    url_df = pd.read_csv(input_file)
+    filtered_df = url_df[url_df['Season'] >= cutoff_year] ## hasn't been tested before 1990, but would work in theory ##
+    urls = filtered_df['Box Score Link'].tolist()
+
+
+    for url in urls:
+        print(f'working on {url}')
+        time.sleep((.75 + random.random() * .5))
        try:
-            officials_div_effed = str(parsed.find('div', {'id' : 'all_officials'}))
-            officials_div = BeautifulSoup(officials_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
+            game_data_points = {
+                'Game Link' : None,
+                'Game Date' : None,
+                'Game Day' : None,
+                'Local Start Time' : None,
+                'Game Length' : None,
+                'Stadium' : None,
+                'Stadium Link' : None,
+                'Attendance' : None,
+                'Season': None,
+                'Week' : None,
+                'Home Team' : None,
+                'Away Team' : None,
+                'Home Record' : None,
+                'Away Record' : None,
+                'Home Score' : None,
+                'Away Score' : None,
+                'Home Coach' : None,
+                'Away Coach' : None,
+                'Home Coach Link' : None,
+                'Away Coach Link' : None,
+                'Home Starting QB' : None,
+                'Away Starting QB' : None,
+                'Home Starting QB Link' : None,
+                'Away Starting QB Link' : None,
+                'Won Toss' : None,
+                'Won Toss (OT)' : None,
+                'Roof' : None,
+                'Surface' : None,
+                'Weather' : None,
+                'Vegas Line' : None,
+                'Over/Under' : None,
+                'Referee' : None,
+                'Umpire' : None,
+                'Head Linesman / Down Judge' : None,
+                'Line Judge' : None,
+                'Back Judge' : None,
+                'Side Judge' : None,
+                'Field Judge' : None,
+            }
+            raw = requests.get(url)
+            parsed = BeautifulSoup(raw.content, 'html.parser')
+            score_board_divs = parsed.find('div', {'class' : 'scorebox'}).find_all('div', recursive=False)
+            home_div = score_board_divs[0]
+            away_div = score_board_divs[1]
+            meta_div = score_board_divs[2]
+            away_div_divs = away_div.find_all('div', recursive=False)
+            away_team = away_div_divs[0].find('a', {'itemprop' : 'name'}).text
+            try:
+                away_score = int(away_div_divs[1].find('div').text)
+            except:
+                away_score = int(away_div_divs[1].text)
+            away_record = away_div_divs[2].text
+            away_coach = away_div_divs[4].find('a').text
+            away_coach_link = away_div_divs[4].find('a').get('href')
+            home_div_divs = home_div.find_all('div', recursive=False)
+            home_team = home_div_divs[0].find('a', {'itemprop' : 'name'}).text
+            try:
+                home_score = int(home_div_divs[1].find('div').text)
+            except:
+                home_score = int(home_div_divs[1].text)
+            home_record = home_div_divs[2].text
+            home_coach = home_div_divs[4].find('a').text
+            home_coach_link = home_div_divs[4].find('a').get('href')
+            try: ## pfr's commenting messes up bs4s parsing, so the specific part has to get pulled as text and re-parsed ##
+                game_info_div_effed = str(parsed.find('div', {'id': 'all_game_info'}))
+                game_info_div = BeautifulSoup(game_info_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
+            except:
+                game_info_div = None
+            try:
+                home_starter_div_effed = str(parsed.find('div', {'id' : 'all_home_starters'}))
+                home_starter_div = BeautifulSoup(home_starter_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
+            except:
+                home_starter_div = None
+            try:
+                away_starter_div_effed = str(parsed.find('div', {'id' : 'all_vis_starters'}))
+                away_starter_div = BeautifulSoup(away_starter_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
+            except:
+                away_starter_div = None
+            try:
+                officials_div_effed = str(parsed.find('div', {'id' : 'all_officials'}))
+                officials_div = BeautifulSoup(officials_div_effed.split('<!--')[1].split('-->')[0], 'html.parser')
+            except:
+                officials_div = None
+            game_day, game_date, local_start_time, game_length, stadium, stadium_link, attendance = get_meta_data_points(meta_div)
+            won_toss, won_toss_ot, roof, surface, weather, vegas_line, over_under = get_game_info(game_info_div)
+            home_qb, home_qb_link = get_qb_info(home_starter_div)
+            away_qb, away_qb_link = get_qb_info(away_starter_div)
+            referee, umpire, down_judge, line_judge, back_judge, side_judge, field_judge = get_officials_info(officials_div)
+
game_data_points['Game Link'] = url + game_data_points['Game Date'] = game_date + game_data_points['Game Day'] = game_day + game_data_points['Local Start Time'] = local_start_time + game_data_points['Game Length'] = game_length + game_data_points['Stadium'] = stadium + game_data_points['Stadium Link'] = stadium_link + game_data_points['Attendance'] = attendance + game_data_points['Season'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Season'] + game_data_points['Week'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Week'] #' Number'] + game_data_points['Home Team'] = home_team + game_data_points['Away Team'] = away_team + game_data_points['Home Record'] = home_record + game_data_points['Away Record'] = away_record + game_data_points['Home Score'] = home_score + game_data_points['Away Score'] = away_score + game_data_points['Home Coach'] = home_coach + game_data_points['Away Coach'] = away_coach + game_data_points['Home Coach Link'] = home_coach_link + game_data_points['Away Coach Link'] = away_coach_link + game_data_points['Home Starting QB'] = home_qb + game_data_points['Away Starting QB'] = away_qb + game_data_points['Home Starting QB Link'] = home_qb_link + game_data_points['Away Starting QB Link'] = away_qb_link + game_data_points['Won Toss'] = won_toss + game_data_points['Won Toss (OT)'] = won_toss_ot + game_data_points['Roof'] = roof + game_data_points['Surface'] = surface + game_data_points['Weather'] = weather + game_data_points['Vegas Line'] = vegas_line + game_data_points['Over/Under'] = over_under + game_data_points['Referee'] = referee + game_data_points['Umpire'] = umpire + game_data_points['Head Linesman / Down Judge'] = down_judge + game_data_points['Line Judge'] = line_judge + game_data_points['Back Judge'] = back_judge + game_data_points['Side Judge'] = side_judge + game_data_points['Field Judge'] = field_judge + game_data_rows.append(game_data_points) except: - officials_div = None - game_day, game_date, local_start_time, game_length, stadium, stadium_link, attendance = get_meta_data_points(meta_div) - won_toss, won_toss_ot, roof, surface, weather, vegas_line, over_under = get_game_info(game_info_div) - home_qb, home_qb_link = get_qb_info(home_starter_div) - away_qb, away_qb_link = get_qb_info(away_starter_div) - referee, umpire, down_judge, line_judge, back_judge, side_judge, field_judge = get_officials_info(officials_div) - game_data_points['Game Link'] = url - game_data_points['Game Date'] = game_date - game_data_points['Game Day'] = game_day - game_data_points['Local Start Time'] = local_start_time - game_data_points['Game Length'] = game_length - game_data_points['Stadium'] = stadium - game_data_points['Stadium Link'] = stadium_link - game_data_points['Attendance'] = attendance - game_data_points['Season'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Season'] - game_data_points['Week'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Week Number'] - game_data_points['Home Team'] = home_team - game_data_points['Away Team'] = away_team - game_data_points['Home Record'] = home_record - game_data_points['Away Record'] = away_record - game_data_points['Home Score'] = home_score - game_data_points['Away Score'] = away_score - game_data_points['Home Coach'] = home_coach - game_data_points['Away Coach'] = away_coach - game_data_points['Home Coach Link'] = home_coach_link - game_data_points['Away Coach Link'] = away_coach_link - game_data_points['Home Starting QB'] = home_qb - game_data_points['Away Starting QB'] = 
away_qb - game_data_points['Home Starting QB Link'] = home_qb_link - game_data_points['Away Starting QB Link'] = away_qb_link - game_data_points['Won Toss'] = won_toss - game_data_points['Won Toss (OT)'] = won_toss_ot - game_data_points['Roof'] = roof - game_data_points['Surface'] = surface - game_data_points['Weather'] = weather - game_data_points['Vegas Line'] = vegas_line - game_data_points['Over/Under'] = over_under - game_data_points['Referee'] = referee - game_data_points['Umpire'] = umpire - game_data_points['Head Linesman / Down Judge'] = down_judge - game_data_points['Line Judge'] = line_judge - game_data_points['Back Judge'] = back_judge - game_data_points['Side Judge'] = side_judge - game_data_points['Field Judge'] = field_judge - game_data_rows.append(game_data_points) - except: - broken_row = { - 'Season' : None, - 'Week' : None, - 'URL' : None, - } - broken_row['Season'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Season'] - broken_row['Week'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Week'] - broken_row['URL'] = url - broken_box_list.append(broken_row) - print('ROW BROKEN {0}'.format(broken_row)) - - -df = pd.DataFrame(game_data_rows) -df_two = pd.DataFrame(broken_box_list) + broken_row = { + 'Season' : None, + 'Week' : None, + 'URL' : None, + } + broken_row['Season'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Season'] + broken_row['Week'] = filtered_df[filtered_df['Box Score Link'] == url].iloc[0]['Week'] + broken_row['URL'] = url + broken_box_list.append(broken_row) + print('ROW BROKEN {0}'.format(broken_row)) + + + df = pd.DataFrame(game_data_rows) + df_two = pd.DataFrame(broken_box_list) + + + headers = [ + 'Game Link', + 'Game Date', + 'Game Day', + 'Local Start Time', + 'Game Length', + 'Stadium', + 'Stadium Link', + 'Attendance', + 'Season', + 'Week', + 'Home Team', + 'Away Team', + 'Home Record', + 'Away Record', + 'Home Score', + 'Away Score', + 'Home Coach', + 'Away Coach', + 'Home Coach Link', + 'Away Coach Link', + 'Home Starting QB', + 'Away Starting QB', + 'Home Starting QB Link', + 'Away Starting QB Link', + 'Won Toss', + 'Won Toss (OT)', + 'Roof', + 'Surface', + 'Weather', + 'Vegas Line', + 'Over/Under', + 'Referee', + 'Umpire', + 'Head Linesman / Down Judge', + 'Line Judge', + 'Back Judge', + 'Side Judge', + 'Field Judge' + ] + + df = df[headers] + df.to_csv('{0}/game_meta_data.csv'.format(data_folder)) + return -headers = [ - 'Game Link', - 'Game Date', - 'Game Day', - 'Local Start Time', - 'Game Length', - 'Stadium', - 'Stadium Link', - 'Attendance', - 'Season', - 'Week', - 'Home Team', - 'Away Team', - 'Home Record', - 'Away Record', - 'Home Score', - 'Away Score', - 'Home Coach', - 'Away Coach', - 'Home Coach Link', - 'Away Coach Link', - 'Home Starting QB', - 'Away Starting QB', - 'Home Starting QB Link', - 'Away Starting QB Link', - 'Won Toss', - 'Won Toss (OT)', - 'Roof', - 'Surface', - 'Weather', - 'Vegas Line', - 'Over/Under', - 'Referee', - 'Umpire', - 'Head Linesman / Down Judge', - 'Line Judge', - 'Back Judge', - 'Side Judge', - 'Field Judge' -] +def main(): + print('Script was run directly, but this doesn\'t do anything!') + +if __name__ == '__main__': main() -df = df[headers] -df.to_csv('{0}/game_meta_data.csv'.format(data_folder)) diff --git a/week_name_stopgap.py b/week_name_stopgap.py new file mode 100755 index 0000000..7425f22 --- /dev/null +++ b/week_name_stopgap.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue May 26 15:11:02 2020 + +@author: 
dennisbrookner
+"""
+
+import pandas as pd
+
+playoff_dict = {"Wild Card": 18,
+                "Divisional": 19,
+                "Conf Champ": 20,
+                "Super Bowl": 21}
+
+def fix_weeks(input_file, output_path):
+    '''
+    Input file should point to the csv created by pfr_meta_data_pull
+
+    Output path should point to the desired location of the metadata, and should NOT have a trailing slash.
+
+    This function fixes the fact that, up to this point in the pipeline, "Week" is still
+    a string and needs to be parsed into an integer
+    '''
+
+    data = pd.read_csv(input_file)
+
+    week_number = list(data['Week'])
+
+    for i in range(len(week_number)):
+        if "Week" in week_number[i]:
+            week_number[i] = int(week_number[i].split('Week ')[1])
+        elif week_number[i] in playoff_dict:
+            week_number[i] = playoff_dict[week_number[i]]
+        else:
+            raise ValueError(f'Unexpected week name {week_number[i]}')
+
+    data['Week_name'] = data['Week']
+
+    data['Week'] = week_number
+
+    data.to_csv(f'{output_path}/game_meta_data_weeks_fixed.csv')
+
+    return
+
+def main():
+    print('Script was run directly, but this doesn\'t do anything!')
+
+if __name__ == '__main__': main()
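+
+# A quick illustration of what fix_weeks does to the 'Week' column:
+#   'Week 5'    -> 5
+#   'Wild Card' -> 18  (playoff rounds map to 18 through 21 via playoff_dict)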