From 778e581459b50b65b1618cb9137966bc24569052 Mon Sep 17 00:00:00 2001 From: Alex Scofield Date: Thu, 29 Sep 2022 19:13:59 +0200 Subject: [PATCH 1/6] Added comments in stdata and cleaned up the code. I will continue to clean up the code in order to add more methods in a more elegant manner. --- documentation.txt | 9 ++++++++- src/stgraphs.py | 14 ++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/documentation.txt b/documentation.txt index 994d5b0..bc00017 100644 --- a/documentation.txt +++ b/documentation.txt @@ -10,6 +10,7 @@ count_users: Takes a dictionary as a parameter to specify which users to count. timezone_counter: Returns a dictionary whose keys are the timezones and whose values are the amount of users. Takes optional parameter. number_skills_completed_dict: Returns a dictionary with the amount of skills completed as a key and the number of people who have completed those skills as a value. number_skills_completed_data: Returns information regarding how many skills users complete. +days_tracked_data: Returns useful information regarding the days that users track their data, such as the average amount of days, the standard deviation and certain percentiles. SkillData: order_skills_by_popularity: Returns an ordered dictionary with each skill as a key and its number of completions as a value. It takes the optional user_parameter parameter to generate a list for specific types of users. @@ -18,6 +19,12 @@ list_skills_by_ease: Returns a dictionary with each skill and its completion rat ChallengeData: order_challenges_by_popularity: Returns an ordered dictionary with each challenge as a key and its number of completions as a value. It takes the optional user_parameter parameter to generate a list for specific types of users. +get_challenge_completion_rate: Returns number of people who have completed, are in progress and have started each challenge, as well as the ease of said challenge, measured as a fraction of Completed/Started. 
+get_challenge_ease: Returns a dictionary with each Challenge and its completion rate. + + + + GRAPH METHODS: @@ -27,7 +34,7 @@ UserGraph: graph_xp_distribution: Returns a graph with a distribution of users' xp, using a logarithmic scale. pie_timezones: Pie chart of the different timezones bar_timezones: Bar chart of the different timezones -## graph_number_skills_completed: To be constructed +graph_number_skills_completed: Bar chart with the number of skills completed. SkillGraph: graph_skills_by_popularity: Returns a horizontal bar chart of the most popular skills. It takes an optional user_parameter parameter to specify which types of users to analyse and an amount parameter, to specify the amount of skills to graph (note, if graph_all is set to True, all skills will be displayed). diff --git a/src/stgraphs.py b/src/stgraphs.py index 6151311..e79bde8 100644 --- a/src/stgraphs.py +++ b/src/stgraphs.py @@ -27,9 +27,16 @@ def pie_timezones(self, user_parameter={}, tight_layout=True) -> None: def bar_timezones(self, user_parameter={}, tight_layout=True) -> None: data = UserData().timezone_counter(parameter=user_parameter) x = data.keys() - y= data.values() + y = data.values() plt.bar(x, y) self.set_plot("Users per timezone", tight_layout) + + def graph_number_skills_completed(self, user_parameter={}, tight_layout=True) -> None: + data = UserData().number_skills_completed_dict(parameter=user_parameter) + x = data.keys() + y = data.values() + plt.bar(x, y) + self.set_plot("Number of skills completed", tight_layout) class SkillGraph(GraphObject): def graph_skills_by_popularity(self, user_parameter={}, amount=10, graph_all=False, tight_layout=True) -> None: @@ -49,9 +56,9 @@ def graph_skills_by_popularity(self, user_parameter={}, amount=10, graph_all=Fal self.set_plot("Skill Popularity", tight_layout) - def graph_skills_by_ease(self, skill_parameter={}, tight_layout=False) -> None: + def graph_skills_by_ease(self, skill_parameter={}, tight_layout=False, amount=10) -> 
None: data = SkillData().list_skills_by_ease(skill_parameter=skill_parameter) - plt.bar(data.keys(), data.values()) + plt.barh( list(data.keys())[:amount], list(data.values())[:amount]) plt.xlabel("Completion_rate") self.set_plot("Skills by ease", tight_layout=tight_layout) @@ -72,4 +79,3 @@ def graph_challenges_by_popularity(self, user_parameter={}, amount=10, graph_all plt.text(v + 1, i, str(v), color='blue', fontweight='bold') self.set_plot("Challenge Popularity", tight_layout) - From 8f10de647fa2d3926b6a1d86fcc6df8987303a8c Mon Sep 17 00:00:00 2001 From: Alex Scofield Date: Sun, 2 Oct 2022 13:04:22 +0200 Subject: [PATCH 2/6] Added base class for Skills and Challenges as they share most of their methods. --- src/example.py | 33 +++++++++++ src/play.py | 3 + src/stdata.py | 149 +++++++++++++++++++++++++++++++++++-------------- 3 files changed, 142 insertions(+), 43 deletions(-) create mode 100644 src/example.py diff --git a/src/example.py b/src/example.py new file mode 100644 index 0000000..30f96ac --- /dev/null +++ b/src/example.py @@ -0,0 +1,33 @@ +''' +There are two modules that can be used to access the SkillTree data. +We will begin by looking at stdata, which includes a series of methods whose +objective is to perform calculations using the raw data from the data compiling it into +more comprehensible formats. +''' +import stdata + +''' +There are now various ways of proceeding. The way the stdata module is structured is very +simple. There is a class for UserData, one for SkillData and one for ChallengeData. Each of +these is equiped with a wide range of methods, which we must call in order to perform +our analysis on the data. The easiest way of doing so is by creating an instance in place as follows. +''' + +number_users = stdata.UserData().count_users() + +''' +If you're going to call various methods of the UserData class, it might be better to use the +following code. 
+''' + +userData = stdata.UserData() +number_users = userData.count_users() +timezone_info = userData.timezone_counter() + +''' +In order to make queries more interesting it can be very useful to make use of the parameters +that are available in each of the methods. +''' + +# Completion rate of the skills belonging to the fitness category, where users are in timezone 0 +data = stdata.SkillData().get_skill_completion_rate(skill_parameter={"category":"fitness"}, user_parameter={"timezone":0}) diff --git a/src/play.py b/src/play.py index 326fcea..a747de4 100644 --- a/src/play.py +++ b/src/play.py @@ -1,2 +1,5 @@ import stdata import stgraphs + +oso = stdata.SkillData().order_by_popularity() +print(stdata.SkillData().id_to_title_and_level(oso)) \ No newline at end of file diff --git a/src/stdata.py b/src/stdata.py index a56c0b7..af26214 100644 --- a/src/stdata.py +++ b/src/stdata.py @@ -1,59 +1,93 @@ from pymongo import MongoClient from pymongo.server_api import ServerApi import os +from collections import Counter, OrderedDict +import pandas as pd -class DataObject: - def __init__(self) -> None: - db_user = os.getenv("STDB_USER") - db_password = os.getenv("STDB_PASS") - self.client = MongoClient(f'mongodb+srv://{db_user}:{db_password}@adonis.n0u0i.mongodb.net/Database?retryWrites=true&w=majority', server_api=ServerApi('1')) - self.db = self.client.Database - self.users = self.db.Users - self.challenges = self.db.Challenges - self.items = self.db.Items - self.skills = self.db.Skills - self.tasks = self.db.Tasks +# Useful function to make sense of the raw data +def count_and_order(list_to_order) -> OrderedDict: + return OrderedDict(Counter(list_to_order).most_common()) +# Base class for all the different types of data +class DataObject(): + db_user = os.getenv("STDB_USER") + db_password = os.getenv("STDB_PASS") + client = MongoClient(f'mongodb+srv://{db_user}:{db_password}@adonis.n0u0i.mongodb.net/Database?retryWrites=true&w=majority', server_api=ServerApi('1')) + db = 
client.Database + users = db.Users + challenges = db.Challenges + items = db.Items + skills = db.Skills + tasks = db.Tasks + + # Run after each call to close the connection with the Database def close(self) -> None: - self.client.close() + DataObject.client.close() + +# Includes methods common to skills and challenges +class ActionData (DataObject): + def __init__(self): + self.data_type = None + self.completed = None + self.find_description = None + + #### NOT QUITE THERE YET. STILL HAVE TO FIGURE OUT HOW TO GO FROM ZIP TO DICT + def id_to_goals(self, dictionary) -> dict: + descriptions = [self.data_type.find_one({"_id":item})["goals"]for item in dictionary] + # return list(zip(descriptions, list(dictionary.values()))) + def order_by_popularity(self, user_parameter={}) -> dict: + # First create a list with the lists of skills that each user has completed and then unpack that list. + list = [user[self.completed] for user in DataObject.users.find(user_parameter)] + total_list = [item for sublist in list for item in sublist] + return count_and_order(total_list) class UserData (DataObject): + # Count total users def count_users(self, parameter={}) -> int: - return len(list(self.users.find(parameter))) + return len(list(DataObject.users.find(parameter))) - def timezone_counter(self, parameter={}) -> dict: - from collections import Counter, OrderedDict - time_zone_list = [] - users = self.users.find(parameter) - for user in users: - time_zone_list.append(str(user["timezone"])) - timezone_dict = OrderedDict(Counter(time_zone_list).most_common()) - return timezone_dict + # Count users per timezone + def timezone_counter(self, parameter={}) -> OrderedDict: + # First create a list with the timezones each user has, then apply a Counter to it, and then package it all into an Ordered Dict + return count_and_order([str(user["timezone"]) for user in DataObject.users.find(parameter)]) - def number_skills_completed_dict(self, parameter={}) -> dict: - from collections import 
Counter - data = [len(user["skillscompleted"]) for user in self.users.find(parameter)] - final_dict = Counter(data) - return final_dict + # Dictonary with number of skills users have completed + def number_skills_completed_dict(self, parameter={}) -> OrderedDict: + return count_and_order([len(user["skillscompleted"]) for user in self.users.find(parameter)]) + # Describe the skills completed data def number_skills_completed_data(self, parameter={}) -> str: - import pandas as pd - data = [len(user["skillscompleted"]) for user in self.users.find(parameter)] - total = pd.Series(data).describe() - return total + return pd.Series([len(user["skillscompleted"]) for user in self.users.find(parameter)]).describe() + + # Describe the days tracked data + def days_tracked_data(self, parameter={}) -> str: + return pd.Series([user["numDaysTracked"] for user in self.users.find(parameter)]).describe() + +# SkillData object, inheriting from DataObject + + +### REWRITING ALL METHODS SUCH THAT THE RETURN IS IN TERMS OF ID. THAT WAY VARIOUS WAYS OF RETURNING DATA WITH EXTRA METHODS +class SkillData(ActionData): + def __init__(self): + super().__init__() + self.data_type = DataObject.skills + self.completed = "skillscompleted" -class SkillData(DataObject): + def id_to_title_and_level(self, dictionary) -> dict: + title_and_id = [(self.data_type.find_one({"_id":item})["title"], self.data_type.find_one({"_id":item})["level"]) for item in dictionary] + return dict(zip(title_and_id, dictionary.values())) + + ## REWRITING THIS METHOD def order_skills_by_popularity(self, user_parameter={}) -> list: - from collections import Counter, OrderedDict - total_list = [] - users = self.users.find(user_parameter) - for user in users: - for skill in user["skillscompleted"]: - total_list.append(skill) - total_dictionary = OrderedDict(Counter(total_list).most_common()) + # First create a list with the lists of skills that each user has completed and then unpack that list. 
+ skill_list = [user["skillscompleted"] for user in self.users.find(user_parameter)] + + ### FIX TOTAL_LIST (FOR THE MOMENT IT RETURNS SKILL_LIST). USE INDECES + total_list = [skill for skill in skill_list] + total_dictionary = count_and_order(total_list) skills = total_dictionary.keys() skill_descriptions = [self.skills.find_one({"_id":skill})["goals"] for skill in skills] @@ -73,7 +107,7 @@ def get_skill_completion_rate(self, user_parameter={}, skill_parameter={}) -> di if completed in skills: completed_list.append(completed) for progress in user["skillsinprogress"]: - if completed in skills: + if progress in skills: progress_list.append(progress) completed_counted = Counter(completed_list) @@ -85,15 +119,14 @@ def get_skill_completion_rate(self, user_parameter={}, skill_parameter={}) -> di def list_skills_by_ease(self, skill_parameter={}) -> dict: data = self.get_skill_completion_rate(skill_parameter=skill_parameter) - keys = [self.skills.find_one({"_id":id})["title"] for id in data.keys()] + keys = [self.skills.find_one({"_id":id})["goals"][0] for id in data.keys()] values = [value['Score'] for value in data.values()] total_dict = dict(zip(keys, values)) return total_dict -class ChallengeData(DataObject): +class ChallengeData(ActionData): def order_challenges_by_popularity(self, user_parameter={}) -> list: - from collections import Counter, OrderedDict total_list = [] users = self.users.find(user_parameter) for user in users: @@ -107,4 +140,34 @@ def order_challenges_by_popularity(self, user_parameter={}) -> list: title_count = dict(zip(challenge_descriptions, total_dictionary.values())) return title_count - \ No newline at end of file + + def get_challenge_completion_rate(self, user_parameter={}, challenge_parameter={}) -> dict: + users = self.users.find(user_parameter) + challenges = [challenge["_id"] for challenge in self.challenges.find(challenge_parameter)] + completed_list = [] + progress_list = [] + + for user in users: + for completed in 
user["challengescompleted"]: + if completed in challenges: + completed_list.append(completed) + for progress in user["challengesinprogress"]: + if progress in challenges: + progress_list.append(progress) + + completed_counted = Counter(completed_list) + progress_counted = Counter(progress_list) + data_unordered = {key: {'Started': value + completed_counted[key], 'Progress': value, 'Completed': completed_counted[key], 'Score':float(completed_counted[key])/float(value+completed_counted[key])} for (key, value) in progress_counted.items()} + data_ordered = dict(sorted(data_unordered.items(), key=lambda x:x[1]['Score'])) + + return data_ordered + + def get_challenge_ease(self, challenge_parameter={}) -> dict: + data = self.get_challenge_completion_rate(challenge_parameter=challenge_parameter) + keys = [self.challenges.find_one({"_id":id})["goals"][0] for id in data.keys()] + values = [value['Score'] for value in data.values()] + total_dict = dict(zip(keys, values)) + return total_dict + + +print(SkillData().id_to_goals(SkillData().order_by_popularity())) From 5b49812c6fc8e550cf3d211950c79969c2b442bc Mon Sep 17 00:00:00 2001 From: Alex Scofield Date: Sun, 11 Jun 2023 21:00:04 +0200 Subject: [PATCH 3/6] New refactored version. 
--- README.md | 19 +- documentation.txt | 44 -- requirements.txt | 3 +- {src => src (outdated)}/__init__.py | 0 {src => src (outdated)}/example.py | 1 + src (outdated)/gui.py | 31 ++ src (outdated)/play.py | 4 + src (outdated)/stdata.py | 129 ++++++ {src => src (outdated)}/stgraphs.py | 0 src/outlier_analysis.ipynb | 601 ++++++++++++++++++++++++++++ src/play.py | 5 - src/stdata.py | 173 -------- src/utilities/__init__.py | 0 src/utilities/data.py | 53 +++ src/utilities/skills.py | 37 ++ src/utilities/users.py | 45 +++ 16 files changed, 918 insertions(+), 227 deletions(-) delete mode 100644 documentation.txt rename {src => src (outdated)}/__init__.py (100%) rename {src => src (outdated)}/example.py (99%) create mode 100644 src (outdated)/gui.py create mode 100644 src (outdated)/play.py create mode 100644 src (outdated)/stdata.py rename {src => src (outdated)}/stgraphs.py (100%) create mode 100644 src/outlier_analysis.ipynb delete mode 100644 src/play.py delete mode 100644 src/stdata.py create mode 100644 src/utilities/__init__.py create mode 100644 src/utilities/data.py create mode 100644 src/utilities/skills.py create mode 100644 src/utilities/users.py diff --git a/README.md b/README.md index f730108..0bfcda3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,16 @@ -# Skill-Tree-Data-Analytics -A python program to compile useful insights from users' skill tree data. -# Installation +# Skill-Tree Data-Analytics +Python library that streamlines the process of data analysis for Project Skill Tree. + +## Installation To install the necessary dependencies run the command pip install -r requirements.txt -To access the Database you must use a Database User and a Database Password, stored as enviornment variables as "STDB_USER" and "STDB_PASSWORD" respectively. \ No newline at end of file + +To access the Database you must use a Database User and a Database Password, stored as enviornment variables as "STDB_USER" and "STDB_PASSWORD" respectively. 
+ +## Usage +There are two parts to the repository: +1) The utilities package. +2) Jupyter notebooks on which the Data Analysis takes place. + +The utilities package contains methods that process the data. There is a pandas DataFrame for each type of data, which can be manipulated as needed in the notebooks. + +Currently the old version of the source code is still in the repository, until refactoring is completed. \ No newline at end of file diff --git a/documentation.txt b/documentation.txt deleted file mode 100644 index bc00017..0000000 --- a/documentation.txt +++ /dev/null @@ -1,44 +0,0 @@ -Here is a small reference of the methods to easily access and visualise the SkillTree data. - -## Marks methods that currently present issues - - -DATA METHODS: - -UserData: -count_users: Takes a dictionary as a parameter to specify which users to count. If left empty, it will return the total number of SkillTree users. -timezone_counter: Returns a dictionary whose keys are the timezones and whose values are the amount of users. Takes optional parameter. -number_skills_completed_dict: Returns a dictionary with the amount of skills completed as a key and the number of people who have completed those skills as a value. -number_skills_completed_data: Returns information regarding how many skills users complete. -days_tracked_data: Returns useful information regarding the days that users track their data, such as the average amount of days, the standard deviation and certain percentiles. - -SkillData: -order_skills_by_popularity: Returns an ordered dictionary with each skill as a key and its number of completions as a value. It takes the optional user_parameter parameter to generate a list for specific types of users. 
-get_skill_completion_rate: Returns an ordered dictionary whose key is the id of each skill, and whose value is a dictionary containing the amount of users that have started, that are in progress and that have completed each skill, along with a score (indicating, from 0 to 1, the completion rate of the skill). Accepts parameters for skill and users. -list_skills_by_ease: Returns a dictionary with each skill and its completion rate - -ChallengeData: -order_challenges_by_popularity: Returns an ordered dictionary with each challenge as a key and its number of completions as a value. It takes the optional user_parameter parameter to generate a list for specific types of users. -get_challenge_completion_rate: Returns number of people who have completed, are in progress and have started each challenge, as well as the ease of said challenge, measured as a fraction of Completed/Started. -get_challenge_ease: Returns a dictionary with each Challenge and its completion rate. - - - - - - -GRAPH METHODS: -Note: for all graphs there exists a parameter called tight_layout, which, if set to False deactivates layout optimisation. This can be used if the plots are not displaying properly. - -UserGraph: -graph_xp_distribution: Returns a graph with a distribution of users' xp, using a logarithmic scale. -pie_timezones: Pie chart of the different timezones -bar_timezones: Bar chart of the different timezones -graph_number_skills_completed: Bar chart with the number of skills completed. - -SkillGraph: -graph_skills_by_popularity: Returns a horizontal bar chart of the most popular skills. It takes an optional user_parameter parameter to specify which types of users to analyse and an amount parameter, to specify the amount of skills to graph (note, if graph_all is set to True, all skills will be displayed). -## graph_skills_by_ease: Graphs skills by ease. Can take parameter. - -ChallengeGraph: -graph_challenges_by_popularity: Returns a horizontal bar chart of the most popular challenges. 
It takes an optional user_parameter parameter to specify which types of users to analyse and an amount parameter, to specify the amount of skills to graph (note, if graph_all is set to True, all skills will be displayed). diff --git a/requirements.txt b/requirements.txt index ed3a068..c9b0494 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ matplotlib pymongo pymongo[srv] -pandas \ No newline at end of file +pandas +seaborn \ No newline at end of file diff --git a/src/__init__.py b/src (outdated)/__init__.py similarity index 100% rename from src/__init__.py rename to src (outdated)/__init__.py diff --git a/src/example.py b/src (outdated)/example.py similarity index 99% rename from src/example.py rename to src (outdated)/example.py index 30f96ac..b1c7e93 100644 --- a/src/example.py +++ b/src (outdated)/example.py @@ -31,3 +31,4 @@ # Completion rate of the skills belonging to the fitness category, where users are in timezone 0 data = stdata.SkillData().get_skill_completion_rate(skill_parameter={"category":"fitness"}, user_parameter={"timezone":0}) +print(data) \ No newline at end of file diff --git a/src (outdated)/gui.py b/src (outdated)/gui.py new file mode 100644 index 0000000..c4bd3e9 --- /dev/null +++ b/src (outdated)/gui.py @@ -0,0 +1,31 @@ +from stdata import * +from stgraphs import * +from PyQt6.QtWidgets import * +import sys + +# Definition of PyQt App, Layout and Window +app = QApplication(sys.argv) + + +class MainWin(QMainWindow): + def __init__(self): + super().__init__() + self.button = QPushButton('Top') + self.button.clicked.connect(self.show_new_window) + self.setCentralWidget(self.button) + + def show_new_window(self, checked): + self.w = SkillWin() + self.w.show() + +class SkillWin(QWidget): + def __init__(self): + super().__init__() + layout = QVBoxLayout() + label = QLabel(str(SkillData().get_ease())) + layout.addWidget(label) + self.setLayout(layout) + +mainWin = MainWin() +mainWin.show() +app.exec() diff --git a/src 
(outdated)/play.py b/src (outdated)/play.py new file mode 100644 index 0000000..c35bfa7 --- /dev/null +++ b/src (outdated)/play.py @@ -0,0 +1,4 @@ +from stdata import * +import stgraphs + +stgraphs.SkillGraph().graph_skills_by_ease() \ No newline at end of file diff --git a/src (outdated)/stdata.py b/src (outdated)/stdata.py new file mode 100644 index 0000000..0b81510 --- /dev/null +++ b/src (outdated)/stdata.py @@ -0,0 +1,129 @@ +from pymongo import MongoClient +from pymongo.server_api import ServerApi +import os +from collections import Counter, OrderedDict +import pandas as pd + +################ CREATE GET SCV + + +# Useful function to make sense of the raw data +def count_and_order(list_to_order) -> OrderedDict: + return OrderedDict(Counter(list_to_order).most_common()) + +# Base class for all the different types of data +class DataObject(): + db_user = os.getenv("STDB_USER") + db_password = os.getenv("STDB_PASS") + client = MongoClient(f'mongodb+srv://{db_user}:{db_password}@adonis.n0u0i.mongodb.net/Database?retryWrites=true&w=majority', server_api=ServerApi('1')) + db = client.Database + users = db.Users + challenges = db.Challenges + items = db.Items + skills = db.Skills + tasks = db.Tasks + + # Run after each call to close the connection with the Database + def close(self) -> None: + DataObject.client.close() + +# Includes methods common to skills and challenges +class ActionData (DataObject): + def __init__(self): + self.data_type = None + self.completed = None + self.progress = None + self.find_description = None + + def id_to_goals(self, dictionary) -> dict: + descriptions = [self.data_type.find_one({"_id":item})["goals"]for item in dictionary] + return list(zip(descriptions, list(dictionary.values()))) + + def order_by_popularity(self, user_parameter={}) -> dict: + # First create a list with the lists of skills that each user has completed and then unpack that list. 
+ list = [user[self.completed] for user in DataObject.users.find(user_parameter)] + total_list = [item for sublist in list for item in sublist] + return count_and_order(total_list) + + def get_completion_rate(self, user_parameter={}, action_parameter={}) -> dict: + from collections import Counter + users = self.users.find(user_parameter) + items = [item["_id"] for item in self.data_type.find(action_parameter)] + completed_list = [] + progress_list = [] + + for user in users: + for completed in user[self.completed]: + if completed in items: + completed_list.append(completed) + for progress in user[self.progress]: + if progress in items: + progress_list.append(progress) + + completed_counted = Counter(completed_list) + progress_counted = Counter(progress_list) + data_unordered = {key: {'Started': value + completed_counted[key], 'Progress': value, 'Completed': completed_counted[key], 'Score':float(completed_counted[key])/float(value+completed_counted[key])} for (key, value) in progress_counted.items()} + data_ordered = dict(sorted(data_unordered.items(), key=lambda x:x[1]['Score'])) + + return data_ordered + + def get_ease(self, action_parameter={}) -> dict: + data = self.get_completion_rate(action_parameter=action_parameter) + keys = [self.data_type.find_one({"_id":id})["goals"][0] for id in data.keys()] + values = [value['Score'] for value in data.values()] + total_dict = dict(zip(keys, values)) + return total_dict + + +class UserData (DataObject): + def count_users(self, parameter={}) -> int: + return len(list(DataObject.users.find(parameter))) + + def timezone_counter(self, parameter={}) -> OrderedDict: + return count_and_order([str(user["timezone"]) for user in DataObject.users.find(parameter)]) + + def number_skills_completed_dict(self, parameter={}) -> OrderedDict: + return count_and_order([len(user["skillscompleted"]) for user in self.users.find(parameter)]) + + def number_skills_completed_data(self, parameter={}) -> str: + return 
pd.Series([len(user["skillscompleted"]) for user in self.users.find(parameter)]).describe() + + def days_tracked_data(self, parameter={}) -> str: + return pd.Series([user["numDaysTracked"] for user in self.users.find(parameter)]).describe() + + +class SkillData(ActionData): + def __init__(self): + super().__init__() + self.data_type = DataObject.skills + self.completed = "skillscompleted" + self.progress = "skillsinprogress" + + def id_to_title_and_level(self, dictionary) -> dict: + title_and_id = [(self.data_type.find_one({"_id":item})["title"], self.data_type.find_one({"_id":item})["level"]) for item in dictionary] + return dict(zip(title_and_id, dictionary.values())) + + def get_skills_csv(self) -> None: + import csv + data = self.get_completion_rate() + titles = list(self.id_to_title_and_level(data).keys()) + goals = [item[0] for item in self.id_to_goals(data)] + started = [data[datum]["Started"] for datum in data] + progress = [data[datum]["Progress"] for datum in data] + completed = [data[datum]["Completed"] for datum in data] + score = [data[datum]["Score"] for datum in data] + rows = [[titles[i], goals[i], started[i], progress[i], completed[i], score[i]] for i in range(len(titles))] + + with open('skills.csv', 'w', encoding='UTF8') as f: + writer = csv.writer(f, delimiter=';') + for row in rows: + writer.writerow(row) + +class ChallengeData(ActionData): + def __init__(self): + super().__init__() + self.data_type = DataObject.challenges + self.completed = "challengescompleted" + self.progress = "challengesinprogress" + +SkillData().get_skills_csv() \ No newline at end of file diff --git a/src/stgraphs.py b/src (outdated)/stgraphs.py similarity index 100% rename from src/stgraphs.py rename to src (outdated)/stgraphs.py diff --git a/src/outlier_analysis.ipynb b/src/outlier_analysis.ipynb new file mode 100644 index 0000000..a298f85 --- /dev/null +++ b/src/outlier_analysis.ipynb @@ -0,0 +1,601 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# Outlier Users\n", + "In this notebook we will analyse outliers in the dataset. These come in various shapes and forms. We will look at 3 types of outliers. \n", + "1) Those with negative *xp*.\n", + "2) Those with very high *xp*.\n", + "3) Those whose character is neither *male* nor *female*." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import utilities.data as ud\n", + "import utilities.users as uu\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "DATA_DIR = \"./data\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Execute only if you want to fetch the data from the Database.\n", + "ud.fetch_data(DATA_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "users, challenges, items, skills, tasks = ud.read_data(DATA_DIR)\n", + "\n", + "# We are only interested in active users.\n", + "users = uu.process(uu.active(users))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analysis" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we will have a look at users with less than 0 xp points. The existence of such users suggests a bug in the system, possibly caused by a player unmarking a skill as complete." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idxpxpHistoryitemsskillscompletedskillsinprogresschallengescompletedchallengesinprogresscharactertimezonebaselocationlastTrackednumDaysTrackedreminderSent
466363e598ddb0f5c03625155c75-60[0][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[][ObjectId('62c226cf9efefadfd10e20ad'), ObjectI...[][]male0.010543760219751138692023-02-15 03:32:30.21161
\n", + "
" + ], + "text/plain": [ + " _id xp xpHistory \\\n", + "4663 63e598ddb0f5c03625155c75 -60 [0] \n", + "\n", + " items skillscompleted \\\n", + "4663 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... [] \n", + "\n", + " skillsinprogress challengescompleted \\\n", + "4663 [ObjectId('62c226cf9efefadfd10e20ad'), ObjectI... [] \n", + "\n", + " challengesinprogress character timezone baselocation \\\n", + "4663 [] male 0.0 1054376021975113869 \n", + "\n", + " lastTracked numDaysTracked reminderSent \n", + "4663 2023-02-15 03:32:30.211 6 1 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users[users[\"xp\"]<0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the data for the 0.1% top users." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idxpxpHistoryitemsskillscompletedskillsinprogresschallengescompletedchallengesinprogresscharactertimezonebaselocationlastTrackednumDaysTrackedreminderSent
38162c9e671c6fc4a6d588902dc166390[0, 0, 2420, 11020, 16640, 27340, 38740, 43840...[ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20ad'), ObjectI...[ObjectId('62c226d69efefadfd10e2167'), ObjectI...[ObjectId('62c226d09efefadfd10e20bb'), ObjectI...[]male-4.09539241922591703352023-04-01 09:26:19.6371491
40662cbb64d17466f8557f81ee5138860[0, 5190, 8840, 12340, 12340, 15940, 23640, 33...[ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20ad'), ObjectI...[ObjectId('62c226d69efefadfd10e216e'), ObjectI...[ObjectId('62c226d09efefadfd10e20bb'), ObjectI...[]male-8.03338051417465159802023-06-10 22:16:02.511364-1
93662f928203ab35244f0edb52b214050[0, 0, 0, 560, 2660, 6110, 9130, 15480, 22480,...[ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20b2'), ObjectI...[ObjectId('62c226d19efefadfd10e20d6')][ObjectId('62c226d09efefadfd10e20bb'), ObjectI...[]male0.09758595729694556462023-06-11 18:36:20.631234-1
140362fec820f73481669ecc9eb1137150[0, 0, 340, 1440, 6290, 10680, 13580, 14980, 2...[ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20b2'), ObjectI...[][ObjectId('62c226df9efefadfd10e2242'), ObjectI...[]male-6.09539241922591703352023-04-10 01:01:35.8874751
\n", + "
" + ], + "text/plain": [ + " _id xp \\\n", + "381 62c9e671c6fc4a6d588902dc 166390 \n", + "406 62cbb64d17466f8557f81ee5 138860 \n", + "936 62f928203ab35244f0edb52b 214050 \n", + "1403 62fec820f73481669ecc9eb1 137150 \n", + "\n", + " xpHistory \\\n", + "381 [0, 0, 2420, 11020, 16640, 27340, 38740, 43840... \n", + "406 [0, 5190, 8840, 12340, 12340, 15940, 23640, 33... \n", + "936 [0, 0, 0, 560, 2660, 6110, 9130, 15480, 22480,... \n", + "1403 [0, 0, 340, 1440, 6290, 10680, 13580, 14980, 2... \n", + "\n", + " items \\\n", + "381 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "406 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "936 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "1403 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "\n", + " skillscompleted \\\n", + "381 [ObjectId('62c226cf9efefadfd10e20ad'), ObjectI... \n", + "406 [ObjectId('62c226cf9efefadfd10e20ad'), ObjectI... \n", + "936 [ObjectId('62c226cf9efefadfd10e20b2'), ObjectI... \n", + "1403 [ObjectId('62c226cf9efefadfd10e20b2'), ObjectI... \n", + "\n", + " skillsinprogress \\\n", + "381 [ObjectId('62c226d69efefadfd10e2167'), ObjectI... \n", + "406 [ObjectId('62c226d69efefadfd10e216e'), ObjectI... \n", + "936 [ObjectId('62c226d19efefadfd10e20d6')] \n", + "1403 [] \n", + "\n", + " challengescompleted challengesinprogress \\\n", + "381 [ObjectId('62c226d09efefadfd10e20bb'), ObjectI... [] \n", + "406 [ObjectId('62c226d09efefadfd10e20bb'), ObjectI... [] \n", + "936 [ObjectId('62c226d09efefadfd10e20bb'), ObjectI... [] \n", + "1403 [ObjectId('62c226df9efefadfd10e2242'), ObjectI... 
[] \n", + "\n", + " character timezone baselocation lastTracked \\\n", + "381 male -4.0 953924192259170335 2023-04-01 09:26:19.637 \n", + "406 male -8.0 333805141746515980 2023-06-10 22:16:02.511 \n", + "936 male 0.0 975859572969455646 2023-06-11 18:36:20.631 \n", + "1403 male -6.0 953924192259170335 2023-04-10 01:01:35.887 \n", + "\n", + " numDaysTracked reminderSent \n", + "381 149 1 \n", + "406 364 -1 \n", + "936 234 -1 \n", + "1403 475 1 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_users = users[users[\"xp\"]>users[\"xp\"].quantile(0.999)]\n", + "top_users" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will now calculate the coefficient of variation $C_v=\\frac{\\sigma}{\\mu}$, for the top users. As a reminder, a higher value of $C_v$ corresponds to higher variation in the dataset. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The coefficient of variation is 0.2187\n" + ] + } + ], + "source": [ + "cv = uu.coeff_variation(top_users, \"xp\")\n", + "print(f\"The coefficient of variation is {cv:.4f}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will also calculate the coefficient of variation of the entire dataset. A high value (over 1) here indicates that the data is very spread out." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The coefficient of variation is 3.5567\n" + ] + } + ], + "source": [ + "cv = uu.coeff_variation(users, \"xp\")\n", + "print(f\"The coefficient of variation is {cv:.4f}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A good way to quantify exactly how extreme their *xp* level is in comparison to other active users is to look at how many *standard deviations* they stray from the mean. For reference, if the data were normally distributed, we would expect 99.7% of it to lie within 3 standard deviations of the mean." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The user at poisition number 1 is 18.359 stds away from the mean.\n", + "The user at poisition number 2 is 14.208 stds away from the mean.\n", + "The user at poisition number 3 is 11.811 stds away from the mean.\n", + "The user at poisition number 4 is 11.662 stds away from the mean.\n" + ] + } + ], + "source": [ + "top_users = top_users.sort_values(\"xp\", ascending=False)\n", + "\n", + "j = 0\n", + "\n", + "for i, user in top_users.iterrows():\n", + " j += 1\n", + " v = abs(user[\"xp\"]-users[\"xp\"].mean())/users[\"xp\"].std()\n", + " print(f\"The user at poisition number {j} is {v:.3f} stds away from the mean.\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another way to view the outliers in the dataset is to view the *xp* level and the number of days tracked. This lets us see two dimensions of outliers: those who have been using SkillTree consistently for a very long time, and those who have high *xp*. 
It is also important to note that those users who have a high *xp* level and low number of tracked days are \"intense users\" in the sense that they have been able to progress enormously in a short amount of time." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.title(\"XP / Days Tracked\")\n", + "plt.scatter(users[\"numDaysTracked\"], users[\"xp\"], c=users[\"reminderSent\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally we will look at the last type of outlier, the users who have the gender of the character set to neither male nor female." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idxpxpHistoryitemsskillscompletedskillsinprogresschallengescompletedchallengesinprogresscharactertimezonebaselocationlastTrackednumDaysTrackedreminderSent
14956301810ef73481669ecd49a81900[0, 0, 500, 900, 900, 1600, 1900, 1900][ObjectId('62c226d09efefadfd10e20c6'), ObjectI...[ObjectId('62c226cf9efefadfd10e20ad'), ObjectI...[ObjectId('62c226d19efefadfd10e20d9'), ObjectI...[][ObjectId('62c226df9efefadfd10e2242'), ObjectI...ok-6.04972224881457889292022-09-29 21:57:05.492311
\n", + "
" + ], + "text/plain": [ + " _id xp xpHistory \\\n", + "1495 6301810ef73481669ecd49a8 1900 [0, 0, 500, 900, 900, 1600, 1900, 1900] \n", + "\n", + " items \\\n", + "1495 [ObjectId('62c226d09efefadfd10e20c6'), ObjectI... \n", + "\n", + " skillscompleted \\\n", + "1495 [ObjectId('62c226cf9efefadfd10e20ad'), ObjectI... \n", + "\n", + " skillsinprogress challengescompleted \\\n", + "1495 [ObjectId('62c226d19efefadfd10e20d9'), ObjectI... [] \n", + "\n", + " challengesinprogress character timezone \\\n", + "1495 [ObjectId('62c226df9efefadfd10e2242'), ObjectI... ok -6.0 \n", + "\n", + " baselocation lastTracked numDaysTracked \\\n", + "1495 497222488145788929 2022-09-29 21:57:05.492 31 \n", + "\n", + " reminderSent \n", + "1495 1 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users[(users[\"character\"] != \"male\") & (users[\"character\"] != \"female\")]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/play.py b/src/play.py deleted file mode 100644 index a747de4..0000000 --- a/src/play.py +++ /dev/null @@ -1,5 +0,0 @@ -import stdata -import stgraphs - -oso = stdata.SkillData().order_by_popularity() -print(stdata.SkillData().id_to_title_and_level(oso)) \ No newline at end of file diff --git a/src/stdata.py b/src/stdata.py deleted file mode 100644 index af26214..0000000 --- a/src/stdata.py +++ /dev/null @@ -1,173 +0,0 @@ -from pymongo import MongoClient -from pymongo.server_api import ServerApi -import os -from collections import Counter, OrderedDict -import pandas as pd - - -# Useful function to 
make sense of the raw data -def count_and_order(list_to_order) -> OrderedDict: - return OrderedDict(Counter(list_to_order).most_common()) - -# Base class for all the different types of data -class DataObject(): - db_user = os.getenv("STDB_USER") - db_password = os.getenv("STDB_PASS") - client = MongoClient(f'mongodb+srv://{db_user}:{db_password}@adonis.n0u0i.mongodb.net/Database?retryWrites=true&w=majority', server_api=ServerApi('1')) - db = client.Database - users = db.Users - challenges = db.Challenges - items = db.Items - skills = db.Skills - tasks = db.Tasks - - # Run after each call to close the connection with the Database - def close(self) -> None: - DataObject.client.close() - -# Includes methods common to skills and challenges -class ActionData (DataObject): - def __init__(self): - self.data_type = None - self.completed = None - self.find_description = None - - #### NOT QUITE THERE YET. STILL HAVE TO FIGURE OUT HOW TO GO FROM ZIP TO DICT - def id_to_goals(self, dictionary) -> dict: - descriptions = [self.data_type.find_one({"_id":item})["goals"]for item in dictionary] - # return list(zip(descriptions, list(dictionary.values()))) - - def order_by_popularity(self, user_parameter={}) -> dict: - # First create a list with the lists of skills that each user has completed and then unpack that list. 
- list = [user[self.completed] for user in DataObject.users.find(user_parameter)] - total_list = [item for sublist in list for item in sublist] - return count_and_order(total_list) - -class UserData (DataObject): - # Count total users - def count_users(self, parameter={}) -> int: - return len(list(DataObject.users.find(parameter))) - - # Count users per timezone - def timezone_counter(self, parameter={}) -> OrderedDict: - # First create a list with the timezones each user has, then apply a Counter to it, and then package it all into an Ordered Dict - return count_and_order([str(user["timezone"]) for user in DataObject.users.find(parameter)]) - - # Dictonary with number of skills users have completed - def number_skills_completed_dict(self, parameter={}) -> OrderedDict: - return count_and_order([len(user["skillscompleted"]) for user in self.users.find(parameter)]) - - # Describe the skills completed data - def number_skills_completed_data(self, parameter={}) -> str: - return pd.Series([len(user["skillscompleted"]) for user in self.users.find(parameter)]).describe() - - # Describe the days tracked data - def days_tracked_data(self, parameter={}) -> str: - return pd.Series([user["numDaysTracked"] for user in self.users.find(parameter)]).describe() - -# SkillData object, inheriting from DataObject - - -### REWRITING ALL METHODS SUCH THAT THE RETURN IS IN TERMS OF ID. 
THAT WAY VARIOUS WAYS OF RETURNING DATA WITH EXTRA METHODS -class SkillData(ActionData): - def __init__(self): - super().__init__() - self.data_type = DataObject.skills - self.completed = "skillscompleted" - - def id_to_title_and_level(self, dictionary) -> dict: - title_and_id = [(self.data_type.find_one({"_id":item})["title"], self.data_type.find_one({"_id":item})["level"]) for item in dictionary] - return dict(zip(title_and_id, dictionary.values())) - - ## REWRITING THIS METHOD - def order_skills_by_popularity(self, user_parameter={}) -> list: - - # First create a list with the lists of skills that each user has completed and then unpack that list. - skill_list = [user["skillscompleted"] for user in self.users.find(user_parameter)] - - ### FIX TOTAL_LIST (FOR THE MOMENT IT RETURNS SKILL_LIST). USE INDECES - total_list = [skill for skill in skill_list] - total_dictionary = count_and_order(total_list) - skills = total_dictionary.keys() - - skill_descriptions = [self.skills.find_one({"_id":skill})["goals"] for skill in skills] - title_count = dict(zip(skill_descriptions, total_dictionary.values())) - - return title_count - - def get_skill_completion_rate(self, user_parameter={}, skill_parameter={}) -> dict: - from collections import Counter - users = self.users.find(user_parameter) - skills = [skill["_id"] for skill in self.skills.find(skill_parameter)] - completed_list = [] - progress_list = [] - - for user in users: - for completed in user["skillscompleted"]: - if completed in skills: - completed_list.append(completed) - for progress in user["skillsinprogress"]: - if progress in skills: - progress_list.append(progress) - - completed_counted = Counter(completed_list) - progress_counted = Counter(progress_list) - data_unordered = {key: {'Started': value + completed_counted[key], 'Progress': value, 'Completed': completed_counted[key], 'Score':float(completed_counted[key])/float(value+completed_counted[key])} for (key, value) in progress_counted.items()} - 
data_ordered = dict(sorted(data_unordered.items(), key=lambda x:x[1]['Score'])) - - return data_ordered - - def list_skills_by_ease(self, skill_parameter={}) -> dict: - data = self.get_skill_completion_rate(skill_parameter=skill_parameter) - keys = [self.skills.find_one({"_id":id})["goals"][0] for id in data.keys()] - values = [value['Score'] for value in data.values()] - total_dict = dict(zip(keys, values)) - return total_dict - - -class ChallengeData(ActionData): - def order_challenges_by_popularity(self, user_parameter={}) -> list: - total_list = [] - users = self.users.find(user_parameter) - for user in users: - for challenge in user["challengescompleted"]: - total_list.append(challenge) - - total_dictionary = OrderedDict(Counter(total_list).most_common()) - challenges = total_dictionary.keys() - - challenge_descriptions = [self.challenges.find_one({"_id":challenge})["goals"][0] for challenge in challenges] - title_count = dict(zip(challenge_descriptions, total_dictionary.values())) - - return title_count - - def get_challenge_completion_rate(self, user_parameter={}, challenge_parameter={}) -> dict: - users = self.users.find(user_parameter) - challenges = [challenge["_id"] for challenge in self.challenges.find(challenge_parameter)] - completed_list = [] - progress_list = [] - - for user in users: - for completed in user["challengescompleted"]: - if completed in challenges: - completed_list.append(completed) - for progress in user["challengesinprogress"]: - if progress in challenges: - progress_list.append(progress) - - completed_counted = Counter(completed_list) - progress_counted = Counter(progress_list) - data_unordered = {key: {'Started': value + completed_counted[key], 'Progress': value, 'Completed': completed_counted[key], 'Score':float(completed_counted[key])/float(value+completed_counted[key])} for (key, value) in progress_counted.items()} - data_ordered = dict(sorted(data_unordered.items(), key=lambda x:x[1]['Score'])) - - return data_ordered - - def 
get_challenge_ease(self, challenge_parameter={}) -> dict: - data = self.get_challenge_completion_rate(challenge_parameter=challenge_parameter) - keys = [self.challenges.find_one({"_id":id})["goals"][0] for id in data.keys()] - values = [value['Score'] for value in data.values()] - total_dict = dict(zip(keys, values)) - return total_dict - - -print(SkillData().id_to_goals(SkillData().order_by_popularity())) diff --git a/src/utilities/__init__.py b/src/utilities/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utilities/data.py b/src/utilities/data.py new file mode 100644 index 0000000..1e12a91 --- /dev/null +++ b/src/utilities/data.py @@ -0,0 +1,53 @@ +from pymongo import MongoClient +from pymongo.server_api import ServerApi +import os +import pandas as pd + +def fetch_data(dir: str) -> None: + ''' + Stores the data contained in the database locally. + ''' + + db_user = os.getenv("STDB_USER") + db_password = os.getenv("STDB_PASS") + client = MongoClient(f'mongodb+srv://{db_user}:{db_password}@adonis.n0u0i.mongodb.net/Database?retryWrites=true&w=majority', server_api=ServerApi('1')) + + db = client.Database + db_users = db.Users + db_challenges = db.Challenges + db_items = db.Items + db_skills = db.Skills + db_tasks = db.Tasks + + users = pd.DataFrame(list(db_users.find({}))) + challenges = pd.DataFrame(list(db_challenges.find({}))) + items = pd.DataFrame(list(db_items.find({}))) + skills = pd.DataFrame(list(db_skills.find({}))) + tasks = pd.DataFrame(list(db_tasks.find({}))) + + try: + os.mkdir(dir) + except FileExistsError: + pass + + users.to_csv(dir+"/users.csv") + challenges.to_csv(dir+"/challenges.csv") + items.to_csv(dir+"/items.csv") + skills.to_csv(dir+"/skills.csv") + tasks.to_csv(dir+"/tasks.csv") + + client.close() + +def read_data(dir: str) -> tuple: + ''' + Returns the data as a tuple of Dataframes + of users, challenges, items, skills and tasks. 
+ ''' + + users = pd.read_csv(dir+"/users.csv") + challenges = pd.read_csv(dir+"/challenges.csv") + items = pd.read_csv(dir+"/items.csv") + skills = pd.read_csv(dir+"/skills.csv") + tasks = pd.read_csv(dir+"/tasks.csv") + + return users, challenges, items, skills, tasks diff --git a/src/utilities/skills.py b/src/utilities/skills.py new file mode 100644 index 0000000..a3ee751 --- /dev/null +++ b/src/utilities/skills.py @@ -0,0 +1,37 @@ +''' +Provides useful methods regarding skills. +''' + +import pandas as pd + +def add_completed(skills, users) -> pd.DataFrame: + ''' + Adds a column containing information on how many skills have been completed. + ''' + skills["completed"] = 0 + for i, skill in skills.iterrows(): + id = skill["_id"] + count = 0 + for j, user in users.iterrows(): + for completed in user["skillscompleted"]: + if completed==id: + count += 1 + skills["completed"][i] = count + + return skills + +def add_in_progress(skills, users): + ''' + Adds a column containing information on how many skills have been completed. + ''' + skills["completed"] = 0 + for i, skill in skills.iterrows(): + id = skill["_id"] + count = 0 + for j, user in users.iterrows(): + for completed in user["skillsinprogress"]: + if completed==id: + count += 1 + skills["completed"][i] = count + + return skills \ No newline at end of file diff --git a/src/utilities/users.py b/src/utilities/users.py new file mode 100644 index 0000000..c528f64 --- /dev/null +++ b/src/utilities/users.py @@ -0,0 +1,45 @@ +''' +Includes useful methods to process and analyse user data. +''' + +import pandas as pd + + +def process(users: pd.DataFrame) -> pd.DataFrame: + ''' + Deletes unnecessary information and modifies certain columns. + ''' + users = users.drop(["__v", "discordid", "Unnamed: 0"], axis=1) + users["reminderSent"] = users["reminderSent"].replace({True: 1, False: -1}) + return users + +def active(users: pd.DataFrame) -> pd.DataFrame: + ''' + Returns a new Dataframe including only active users. 
+ ''' + return users[users["numDaysTracked"] > 0] + +def non_null(users: pd.DataFrame) -> pd.DataFrame: + ''' + Returns a new Dataframe including only users with non-null xp. + ''' + return users[users["xp"] != 0] + +def null(users: pd.DataFrame) -> pd.DataFrame: + ''' + Returns a new Dataframe including only users with null xp. + ''' + return users[users["xp"] == 0] + +def current(users: pd.DataFrame) -> pd.DataFrame: + ''' + Returns a new Dataframe including only users who have tracked in the week prior to + the current date. + ''' + raise NotImplementedError + +def coeff_variation(users: pd.DataFrame, field: str) -> pd.DataFrame: + ''' + Calculates the coefficient of variation of users for a given field. + ''' + return users[field].std()/users[field].mean() \ No newline at end of file From fd64b535f4b6131e47614d069ece81f44baa5de4 Mon Sep 17 00:00:00 2001 From: Alex Scofield Date: Tue, 13 Jun 2023 14:03:57 +0200 Subject: [PATCH 4/6] Skill Correlation notebook began. --- src/skill_correlation.ipynb | 478 ++++++++++++++++++++++++++++++++++++ src/utilities/users.py | 21 +- 2 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 src/skill_correlation.ipynb diff --git a/src/skill_correlation.ipynb b/src/skill_correlation.ipynb new file mode 100644 index 0000000..5d0bc8c --- /dev/null +++ b/src/skill_correlation.ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Skills Correlations\n", + "In this Notebook we will analyse the correlations that there are between the completion rates of different skills." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import utilities.data as ud\n", + "import utilities.users as uu\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "DATA_DIR = \"./data\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Execute only if you want to fetch the data from the Database.\n", + "ud.fetch_data(DATA_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "users, challenges, items, skills, tasks = ud.read_data(DATA_DIR)\n", + "\n", + "# We are only interested in non-null users.\n", + "users = uu.process(uu.non_null(users))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SKILLSC 1\n", + "Name: 5, dtype: object\n", + "relationshipsC 1\n", + "Name: 5, dtype: object\n", + "skillsC 1\n", + "Name: 5, dtype: object\n", + "MENTAL HEALTHC 1\n", + "Name: 5, dtype: object\n", + "masculinityC 1\n", + "Name: 5, dtype: object\n", + "disciplineC 1\n", + "Name: 5, dtype: object\n", + "SKILLSC 2\n", + "Name: 5, dtype: object\n", + "MENTAL HEALTHC 1\n", + "Name: 8, dtype: object\n", + "fitnessC 1\n", + "Name: 8, dtype: object\n", + "SKILLSC 1\n", + "Name: 8, dtype: object\n", + "DietC 1\n", + "Name: 8, dtype: object\n", + "MENTAL HEALTHC 2\n", + "Name: 8, dtype: object\n", + "skillsC 1\n", + "Name: 8, dtype: object\n", + "mindfulnessC 1\n", + "Name: 8, dtype: object\n", + "disciplineC 1\n", + "Name: 8, dtype: object\n", + "SKILLSC 2\n", + "Name: 8, dtype: object\n", + "SKILLSC 1\n", + "Name: 16, dtype: object\n", + "MENTAL HEALTHC 1\n", + "Name: 16, dtype: object\n", + "skillsC 1\n", + "Name: 16, dtype: object\n", + "fitnessC 1\n", + "Name: 16, dtype: object\n", + "MENTAL HEALTHC 2\n", + "Name: 16, dtype: object\n", + "SKILLSC 2\n", 
+ "Name: 16, dtype: object\n", + "relationshipsC 1\n", + "Name: 16, dtype: object\n", + "SKILLSC 3\n", + "Name: 16, dtype: object\n", + "masculinityC 1\n", + "Name: 16, dtype: object\n", + "SKILLSC 4\n", + "Name: 16, dtype: object\n", + "disciplineC 1\n", + "Name: 16, dtype: object\n", + "SKILLSC 5\n", + "Name: 16, dtype: object\n", + "MENTAL HEALTHC 1\n", + "Name: 22, dtype: object\n", + "fitnessC 1\n", + "Name: 22, dtype: object\n", + "SKILLSC 1\n", + "Name: 22, dtype: object\n", + "disciplineC 1\n", + "Name: 22, dtype: object\n", + "MENTAL HEALTHC 2\n", + "Name: 22, dtype: object\n", + "routinesC 1\n", + "Name: 22, dtype: object\n", + "DietC 1\n", + "Name: 22, dtype: object\n", + "mindfulnessC 1\n", + "Name: 22, dtype: object\n", + "masculinityC 1\n", + "Name: 22, dtype: object\n", + "SKILLSC 2\n", + "Name: 22, dtype: object\n", + "mindfulnessC 2\n", + "Name: 22, dtype: object\n", + "screentimeC 1\n", + "Name: 22, dtype: object\n", + "fitnessC 2\n", + "Name: 22, dtype: object\n", + "SKILLSC 3\n", + "Name: 22, dtype: object\n", + "mindfulnessC 3\n", + "Name: 22, dtype: object\n", + "routinesC 2\n", + "Name: 22, dtype: object\n", + "hydrationC 1\n", + "Name: 22, dtype: object\n", + "screentimeC 2\n", + "Name: 22, dtype: object\n", + "routinesC 3\n", + "Name: 22, dtype: object\n", + "hydrationC 2\n", + "Name: 22, dtype: object\n", + "SKILLSC 1\n", + "Name: 23, dtype: object\n", + "disciplineC 1\n", + "Name: 23, dtype: object\n", + "skillsC 1\n", + "Name: 23, dtype: object\n", + "SKILLSC 2\n", + "Name: 23, dtype: object\n", + "SKILLSC 3\n", + "Name: 23, dtype: object\n", + "SKILLSC 4\n", + "Name: 23, dtype: object\n", + "SKILLSC 5\n", + "Name: 23, dtype: object\n", + "SKILLSC 6\n", + "Name: 23, dtype: object\n", + "SKILLSC 7\n", + "Name: 23, dtype: object\n", + "SKILLSC 8\n", + "Name: 23, dtype: object\n", + "MENTAL HEALTHC 1\n", + "Name: 23, dtype: object\n", + "MENTAL HEALTHC 2\n", + "Name: 23, dtype: object\n", + "fitnessC 1\n", + "Name: 23, dtype: object\n", + 
"SKILLSC 9\n", + "Name: 23, dtype: object\n", + "relationshipsC 1\n", + "Name: 23, dtype: object\n", + "DietC 1\n", + "Name: 23, dtype: object\n" + ] + } + ], + "source": [ + "# This cell can take some time to execute.\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "users = uu.add_completions_per_category(users.head(), skills)\n", + "warnings.resetwarnings()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idxpxpHistoryitemsskillscompletedskillsinprogresschallengescompletedchallengesinprogresscharactertimezone...mindfulnessCroutinesCDietChydrationCattractionCscreentimeCrelationshipsCdisciplineCmasculinityCskillsC
562c4f336953318ddc80d82a0300[0, 300, 300, 300, 300][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20b2'), ObjectI...[ObjectId('62c226d89efefadfd10e219a'), ObjectI...[][]male-8.0...0000000000
862c4f357953318ddc80d82c7100[0, 100][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20ad'), ObjectI...[ObjectId('62c226d19efefadfd10e20d9'), ObjectI...[][ObjectId('62c226d09efefadfd10e20bb')]male-5.0...0000000000
1662c4f3fe953318ddc80d8369860[0, 860][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20b2'), ObjectI...[ObjectId('62c226d09efefadfd10e20b6'), ObjectI...[ObjectId('62c226d09efefadfd10e20bb')][]male-5.0...0000000000
2262c4f691953318ddc80d84c16890[0, 1170, 1840, 2890, 2890, 3490, 4290, 4290, ...[ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20ad'), ObjectI...[ObjectId('62c226d89efefadfd10e21a0'), ObjectI...[ObjectId('62c226d09efefadfd10e20bb'), ObjectI...[]male-5.0...0000000000
2362c4f6a1953318ddc80d84cc1000[0, 1000][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20b2'), ObjectI...[ObjectId('62c226d89efefadfd10e2197'), ObjectI...[ObjectId('62c226e09efefadfd10e2267'), ObjectI...[]male-4.0...0000000000
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " _id xp \\\n", + "5 62c4f336953318ddc80d82a0 300 \n", + "8 62c4f357953318ddc80d82c7 100 \n", + "16 62c4f3fe953318ddc80d8369 860 \n", + "22 62c4f691953318ddc80d84c1 6890 \n", + "23 62c4f6a1953318ddc80d84cc 1000 \n", + "\n", + " xpHistory \\\n", + "5 [0, 300, 300, 300, 300] \n", + "8 [0, 100] \n", + "16 [0, 860] \n", + "22 [0, 1170, 1840, 2890, 2890, 3490, 4290, 4290, ... \n", + "23 [0, 1000] \n", + "\n", + " items \\\n", + "5 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "8 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "16 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "22 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "23 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", + "\n", + " skillscompleted \\\n", + "5 [ObjectId('62c226cf9efefadfd10e20b2'), ObjectI... \n", + "8 [ObjectId('62c226cf9efefadfd10e20ad'), ObjectI... \n", + "16 [ObjectId('62c226cf9efefadfd10e20b2'), ObjectI... \n", + "22 [ObjectId('62c226cf9efefadfd10e20ad'), ObjectI... \n", + "23 [ObjectId('62c226cf9efefadfd10e20b2'), ObjectI... \n", + "\n", + " skillsinprogress \\\n", + "5 [ObjectId('62c226d89efefadfd10e219a'), ObjectI... \n", + "8 [ObjectId('62c226d19efefadfd10e20d9'), ObjectI... \n", + "16 [ObjectId('62c226d09efefadfd10e20b6'), ObjectI... \n", + "22 [ObjectId('62c226d89efefadfd10e21a0'), ObjectI... \n", + "23 [ObjectId('62c226d89efefadfd10e2197'), ObjectI... \n", + "\n", + " challengescompleted \\\n", + "5 [] \n", + "8 [] \n", + "16 [ObjectId('62c226d09efefadfd10e20bb')] \n", + "22 [ObjectId('62c226d09efefadfd10e20bb'), ObjectI... \n", + "23 [ObjectId('62c226e09efefadfd10e2267'), ObjectI... \n", + "\n", + " challengesinprogress character timezone ... \\\n", + "5 [] male -8.0 ... \n", + "8 [ObjectId('62c226d09efefadfd10e20bb')] male -5.0 ... \n", + "16 [] male -5.0 ... \n", + "22 [] male -5.0 ... \n", + "23 [] male -4.0 ... 
\n", + "\n", + " mindfulnessC routinesC DietC hydrationC attractionC screentimeC \\\n", + "5 0 0 0 0 0 0 \n", + "8 0 0 0 0 0 0 \n", + "16 0 0 0 0 0 0 \n", + "22 0 0 0 0 0 0 \n", + "23 0 0 0 0 0 0 \n", + "\n", + " relationshipsC disciplineC masculinityC skillsC \n", + "5 0 0 0 0 \n", + "8 0 0 0 0 \n", + "16 0 0 0 0 \n", + "22 0 0 0 0 \n", + "23 0 0 0 0 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.head()\n", + "#users[[category + \"C\" for category in skills[\"category\"].unique()]].corr()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/utilities/users.py b/src/utilities/users.py index c528f64..4633da7 100644 --- a/src/utilities/users.py +++ b/src/utilities/users.py @@ -42,4 +42,23 @@ def coeff_variation(users: pd.DataFrame, field: str) -> pd.DataFrame: ''' Calculates the coefficient of variation of users for a given field. ''' - return users[field].std()/users[field].mean() \ No newline at end of file + return users[field].std()/users[field].mean() + +def add_completions_per_category(users: pd.DataFrame, skills: pd.DataFrame) -> pd.DataFrame: + ''' + Adds a new column per category of skills detailing the amount of completions each user has + for that given category. 
+ ''' + import ast + import re + users_extended = users + for category in skills["category"].unique(): + users_extended[category+"C"]=0 + + for i, user in users_extended.iterrows(): + clean_str = re.sub(r"ObjectId\('(.+?)'\)", r"'\1'", user["skillscompleted"]) + for completed in ast.literal_eval(clean_str): + skill = skills[skills["_id"] == completed] + category = skill["category"] + user[category+"C"] = user[category+"C"] + 1 + return users_extended \ No newline at end of file From d7aceb21a38d463c7f68c6881d1fd7a7f29b1802 Mon Sep 17 00:00:00 2001 From: Alex Scofield Date: Mon, 19 Jun 2023 16:51:34 +0200 Subject: [PATCH 5/6] New Notebook added analysing the correlations between completion rates of different skills. --- src/outlier_analysis.ipynb | 2 +- src/skill_correlation.ipynb | 525 +++++++++--------------------------- src/utilities/users.py | 4 +- 3 files changed, 133 insertions(+), 398 deletions(-) diff --git a/src/outlier_analysis.ipynb b/src/outlier_analysis.ipynb index a298f85..90f5358 100644 --- a/src/outlier_analysis.ipynb +++ b/src/outlier_analysis.ipynb @@ -448,7 +448,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 9, diff --git a/src/skill_correlation.ipynb b/src/skill_correlation.ipynb index 5d0bc8c..e1248c7 100644 --- a/src/skill_correlation.ipynb +++ b/src/skill_correlation.ipynb @@ -6,7 +6,15 @@ "metadata": {}, "source": [ "# Skills Correlations\n", - "In this Notebook we will analyse the correlations that there are between the completion rates of different skills." + "In this Notebook we will analyse the correlations that exist between the completion rates of different skills." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imports and Load Data" ] }, { @@ -19,430 +27,78 @@ "import utilities.users as uu\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", "\n", "DATA_DIR = \"./data\"" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Execute only if you want to fetch the data from the Database.\n", + "# Execute this cell *only* if you wish to fetch the data from the Database.\n", "ud.fetch_data(DATA_DIR)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data processing\n", + "Note that we're only interested in non-null users (those with more than 0 *xp* points) for the entirety of this Notebook." + ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "users, challenges, items, skills, tasks = ud.read_data(DATA_DIR)\n", - "\n", - "# We are only interested in non-null users.\n", "users = uu.process(uu.non_null(users))" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SKILLSC 1\n", - "Name: 5, dtype: object\n", - "relationshipsC 1\n", - "Name: 5, dtype: object\n", - "skillsC 1\n", - "Name: 5, dtype: object\n", - "MENTAL HEALTHC 1\n", - "Name: 5, dtype: object\n", - "masculinityC 1\n", - "Name: 5, dtype: object\n", - "disciplineC 1\n", - "Name: 5, dtype: object\n", - "SKILLSC 2\n", - "Name: 5, dtype: object\n", - "MENTAL HEALTHC 1\n", - "Name: 8, dtype: object\n", - "fitnessC 1\n", - "Name: 8, dtype: object\n", - "SKILLSC 1\n", - "Name: 8, dtype: object\n", - "DietC 1\n", - "Name: 8, dtype: object\n", - "MENTAL HEALTHC 2\n", - "Name: 8, dtype: object\n", - "skillsC 1\n", - "Name: 8, dtype: object\n", - "mindfulnessC 1\n", - "Name: 8, 
dtype: object\n", - "disciplineC 1\n", - "Name: 8, dtype: object\n", - "SKILLSC 2\n", - "Name: 8, dtype: object\n", - "SKILLSC 1\n", - "Name: 16, dtype: object\n", - "MENTAL HEALTHC 1\n", - "Name: 16, dtype: object\n", - "skillsC 1\n", - "Name: 16, dtype: object\n", - "fitnessC 1\n", - "Name: 16, dtype: object\n", - "MENTAL HEALTHC 2\n", - "Name: 16, dtype: object\n", - "SKILLSC 2\n", - "Name: 16, dtype: object\n", - "relationshipsC 1\n", - "Name: 16, dtype: object\n", - "SKILLSC 3\n", - "Name: 16, dtype: object\n", - "masculinityC 1\n", - "Name: 16, dtype: object\n", - "SKILLSC 4\n", - "Name: 16, dtype: object\n", - "disciplineC 1\n", - "Name: 16, dtype: object\n", - "SKILLSC 5\n", - "Name: 16, dtype: object\n", - "MENTAL HEALTHC 1\n", - "Name: 22, dtype: object\n", - "fitnessC 1\n", - "Name: 22, dtype: object\n", - "SKILLSC 1\n", - "Name: 22, dtype: object\n", - "disciplineC 1\n", - "Name: 22, dtype: object\n", - "MENTAL HEALTHC 2\n", - "Name: 22, dtype: object\n", - "routinesC 1\n", - "Name: 22, dtype: object\n", - "DietC 1\n", - "Name: 22, dtype: object\n", - "mindfulnessC 1\n", - "Name: 22, dtype: object\n", - "masculinityC 1\n", - "Name: 22, dtype: object\n", - "SKILLSC 2\n", - "Name: 22, dtype: object\n", - "mindfulnessC 2\n", - "Name: 22, dtype: object\n", - "screentimeC 1\n", - "Name: 22, dtype: object\n", - "fitnessC 2\n", - "Name: 22, dtype: object\n", - "SKILLSC 3\n", - "Name: 22, dtype: object\n", - "mindfulnessC 3\n", - "Name: 22, dtype: object\n", - "routinesC 2\n", - "Name: 22, dtype: object\n", - "hydrationC 1\n", - "Name: 22, dtype: object\n", - "screentimeC 2\n", - "Name: 22, dtype: object\n", - "routinesC 3\n", - "Name: 22, dtype: object\n", - "hydrationC 2\n", - "Name: 22, dtype: object\n", - "SKILLSC 1\n", - "Name: 23, dtype: object\n", - "disciplineC 1\n", - "Name: 23, dtype: object\n", - "skillsC 1\n", - "Name: 23, dtype: object\n", - "SKILLSC 2\n", - "Name: 23, dtype: object\n", - "SKILLSC 3\n", - "Name: 23, dtype: object\n", - "SKILLSC 
4\n", - "Name: 23, dtype: object\n", - "SKILLSC 5\n", - "Name: 23, dtype: object\n", - "SKILLSC 6\n", - "Name: 23, dtype: object\n", - "SKILLSC 7\n", - "Name: 23, dtype: object\n", - "SKILLSC 8\n", - "Name: 23, dtype: object\n", - "MENTAL HEALTHC 1\n", - "Name: 23, dtype: object\n", - "MENTAL HEALTHC 2\n", - "Name: 23, dtype: object\n", - "fitnessC 1\n", - "Name: 23, dtype: object\n", - "SKILLSC 9\n", - "Name: 23, dtype: object\n", - "relationshipsC 1\n", - "Name: 23, dtype: object\n", - "DietC 1\n", - "Name: 23, dtype: object\n" - ] - } - ], + "outputs": [], "source": [ - "# This cell can take some time to execute.\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "users = uu.add_completions_per_category(users.head(), skills)\n", - "warnings.resetwarnings()" + "# This cell may take some time to execute.\n", + "users = uu.add_completions_per_category(users, skills)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Correlation Matrix\n", + "In the following section we will be analysing the *correlation matrix* of the completion rates of skills per category.\n", + "\n", + "Any negative values in this matrix should be analysed in detail, as this would mean that completing certain skills would make it less likely for users to complete others. Of course, this result is not expected." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
_idxpxpHistoryitemsskillscompletedskillsinprogresschallengescompletedchallengesinprogresscharactertimezone...mindfulnessCroutinesCDietChydrationCattractionCscreentimeCrelationshipsCdisciplineCmasculinityCskillsC
562c4f336953318ddc80d82a0300[0, 300, 300, 300, 300][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20b2'), ObjectI...[ObjectId('62c226d89efefadfd10e219a'), ObjectI...[][]male-8.0...0000000000
862c4f357953318ddc80d82c7100[0, 100][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20ad'), ObjectI...[ObjectId('62c226d19efefadfd10e20d9'), ObjectI...[][ObjectId('62c226d09efefadfd10e20bb')]male-5.0...0000000000
1662c4f3fe953318ddc80d8369860[0, 860][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20b2'), ObjectI...[ObjectId('62c226d09efefadfd10e20b6'), ObjectI...[ObjectId('62c226d09efefadfd10e20bb')][]male-5.0...0000000000
2262c4f691953318ddc80d84c16890[0, 1170, 1840, 2890, 2890, 3490, 4290, 4290, ...[ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20ad'), ObjectI...[ObjectId('62c226d89efefadfd10e21a0'), ObjectI...[ObjectId('62c226d09efefadfd10e20bb'), ObjectI...[]male-5.0...0000000000
2362c4f6a1953318ddc80d84cc1000[0, 1000][ObjectId('62c382d46cac02c487e243cb'), ObjectI...[ObjectId('62c226cf9efefadfd10e20b2'), ObjectI...[ObjectId('62c226d89efefadfd10e2197'), ObjectI...[ObjectId('62c226e09efefadfd10e2267'), ObjectI...[]male-4.0...0000000000
\n", - "

5 rows × 27 columns

\n", - "
" - ], - "text/plain": [ - " _id xp \\\n", - "5 62c4f336953318ddc80d82a0 300 \n", - "8 62c4f357953318ddc80d82c7 100 \n", - "16 62c4f3fe953318ddc80d8369 860 \n", - "22 62c4f691953318ddc80d84c1 6890 \n", - "23 62c4f6a1953318ddc80d84cc 1000 \n", - "\n", - " xpHistory \\\n", - "5 [0, 300, 300, 300, 300] \n", - "8 [0, 100] \n", - "16 [0, 860] \n", - "22 [0, 1170, 1840, 2890, 2890, 3490, 4290, 4290, ... \n", - "23 [0, 1000] \n", - "\n", - " items \\\n", - "5 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", - "8 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", - "16 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", - "22 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", - "23 [ObjectId('62c382d46cac02c487e243cb'), ObjectI... \n", - "\n", - " skillscompleted \\\n", - "5 [ObjectId('62c226cf9efefadfd10e20b2'), ObjectI... \n", - "8 [ObjectId('62c226cf9efefadfd10e20ad'), ObjectI... \n", - "16 [ObjectId('62c226cf9efefadfd10e20b2'), ObjectI... \n", - "22 [ObjectId('62c226cf9efefadfd10e20ad'), ObjectI... \n", - "23 [ObjectId('62c226cf9efefadfd10e20b2'), ObjectI... \n", - "\n", - " skillsinprogress \\\n", - "5 [ObjectId('62c226d89efefadfd10e219a'), ObjectI... \n", - "8 [ObjectId('62c226d19efefadfd10e20d9'), ObjectI... \n", - "16 [ObjectId('62c226d09efefadfd10e20b6'), ObjectI... \n", - "22 [ObjectId('62c226d89efefadfd10e21a0'), ObjectI... \n", - "23 [ObjectId('62c226d89efefadfd10e2197'), ObjectI... \n", - "\n", - " challengescompleted \\\n", - "5 [] \n", - "8 [] \n", - "16 [ObjectId('62c226d09efefadfd10e20bb')] \n", - "22 [ObjectId('62c226d09efefadfd10e20bb'), ObjectI... \n", - "23 [ObjectId('62c226e09efefadfd10e2267'), ObjectI... \n", - "\n", - " challengesinprogress character timezone ... \\\n", - "5 [] male -8.0 ... \n", - "8 [ObjectId('62c226d09efefadfd10e20bb')] male -5.0 ... \n", - "16 [] male -5.0 ... \n", - "22 [] male -5.0 ... \n", - "23 [] male -4.0 ... 
\n", - "\n", - " mindfulnessC routinesC DietC hydrationC attractionC screentimeC \\\n", - "5 0 0 0 0 0 0 \n", - "8 0 0 0 0 0 0 \n", - "16 0 0 0 0 0 0 \n", - "22 0 0 0 0 0 0 \n", - "23 0 0 0 0 0 0 \n", - "\n", - " relationshipsC disciplineC masculinityC skillsC \n", - "5 0 0 0 0 \n", - "8 0 0 0 0 \n", - "16 0 0 0 0 \n", - "22 0 0 0 0 \n", - "23 0 0 0 0 \n", - "\n", - "[5 rows x 27 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "users.head()\n", - "#users[[category + \"C\" for category in skills[\"category\"].unique()]].corr()" + "users_completion_rates = users[[category + \"C\" for category in skills[\"category\"].unique()]]\n", + "corr_matrix = users_completion_rates.corr()\n", + "corr_matrix" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following heatmap provides a more intuitive visualisation of the *correlation matrix*." ] }, { @@ -450,7 +106,86 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "sns.heatmap(corr_matrix)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now calculate the *mean correlation* for each of the categories. This is a measure of the *leverage* that each category has. A higher number means that users that complete skills in that category are more likely to complete skills in other ones. It would therefore be a good idea to encourage users to complete high leverage skills, to encourage more activity throughout the entire *Skill Tree*.\n", + "\n", + "It is important to interpret these results in context. For example, if the score for *MENTAL HEALTH* is low, it could mean that some of its skills are very easy to complete and that some users have only completed skills in this category. 
It is also important to note that certain categories do not contain many skills.\n", + "\n", + "It would also be possible to modify this Notebook in order to only analyse users in the upper quartiles of *xp*, to minimise the effect explained above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corr_matrix.mean()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another informative way to view the correlations is with respect to the mean correlation. In this way, a value of 2 in the following matrix would mean that those two skills correlate twice as much as the mean correlation. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corr_matrix/corr_matrix.unstack().mean()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another visualisation for completion rates is the following scatter plot. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is an example for mindfulness and screentime.\n", + "plt.title(\"Completions per category\")\n", + "plt.ylabel(\"Mindfulness\")\n", + "plt.xlabel(\"Screentime\")\n", + "\n", + "plt.scatter(users_completion_rates[\"screentimeC\"], users_completion_rates[\"mindfulnessC\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can display all such *scatter plots* at once in the following *pair plot*. Also note that the diagonal displays a univariate distribution plot to show the marginal distribution of the data in each column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.pairplot(users_completion_rates)" + ] } ], "metadata": { diff --git a/src/utilities/users.py b/src/utilities/users.py index 4633da7..b045aab 100644 --- a/src/utilities/users.py +++ b/src/utilities/users.py @@ -53,12 +53,12 @@ def add_completions_per_category(users: pd.DataFrame, skills: pd.DataFrame) -> p import re users_extended = users for category in skills["category"].unique(): - users_extended[category+"C"]=0 + users_extended.loc[:, category+"C"] = 0 for i, user in users_extended.iterrows(): clean_str = re.sub(r"ObjectId\('(.+?)'\)", r"'\1'", user["skillscompleted"]) for completed in ast.literal_eval(clean_str): skill = skills[skills["_id"] == completed] category = skill["category"] - user[category+"C"] = user[category+"C"] + 1 + users_extended.at[i, category+"C"] = users_extended.loc[i, category+"C"] + 1 return users_extended \ No newline at end of file From 5ff0aef04a31cda4516a65e7f39e1a0415128f41 Mon Sep 17 00:00:00 2001 From: Alex Scofield Date: Tue, 20 Jun 2023 17:26:08 +0200 Subject: [PATCH 6/6] Distances between users implemented. --- src/user_clustering.ipynb | 101 ++++++++++++++++++++++++++++++++++++++ src/utilities/skills.py | 8 ++- src/utilities/users.py | 47 +++++++++++++++++- 3 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 src/user_clustering.ipynb diff --git a/src/user_clustering.ipynb b/src/user_clustering.ipynb new file mode 100644 index 0000000..2fd2ab8 --- /dev/null +++ b/src/user_clustering.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# User Distances\n", + "In this Notebook we will be looking at different ways of calculating distances between users. This way we can suggest skills by comparing to completed skills of other similar users." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import utilities.data as ud\n", + "import utilities.skills as us\n", + "import utilities.users as uu\n", + "import pandas as pd\n", + "import matplotlib as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "\n", + "DATA_DIR = \"./data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Execute only if you want to fetch the data from the Database.\n", + "ud.fetch_data(DATA_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "users, challenges, items, skills, tasks = ud.read_data(DATA_DIR)\n", + "users = uu.process(uu.non_null(users))\n", + "#We will be using users in the upper quantiles for these examples.\n", + "users = users[users[\"xp\"]>users[\"xp\"].quantile(0.8)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "users = uu.add_skills_completed(users, skills)\n", + "users.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Here's an example of an epsilon neighborhood.\n", + "uu.epsilon_neighborhood(users.iloc[4], users, skills, 6)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/utilities/skills.py b/src/utilities/skills.py index a3ee751..863a138 
100644 --- a/src/utilities/skills.py +++ b/src/utilities/skills.py @@ -34,4 +34,10 @@ def add_in_progress(skills, users): count += 1 skills["completed"][i] = count - return skills \ No newline at end of file + return skills + +def id_to_title(skills: pd.DataFrame, id: str) -> str: + ''' + Takes the id of a skill and returns its title. + ''' + return str(skills[skills["_id"] == id]["title"][0]) \ No newline at end of file diff --git a/src/utilities/users.py b/src/utilities/users.py index b045aab..78a3eda 100644 --- a/src/utilities/users.py +++ b/src/utilities/users.py @@ -3,6 +3,7 @@ ''' import pandas as pd +import numpy as np def process(users: pd.DataFrame) -> pd.DataFrame: @@ -61,4 +62,48 @@ def add_completions_per_category(users: pd.DataFrame, skills: pd.DataFrame) -> p skill = skills[skills["_id"] == completed] category = skill["category"] users_extended.at[i, category+"C"] = users_extended.loc[i, category+"C"] + 1 - return users_extended \ No newline at end of file + return users_extended + +def add_skills_completed(users:pd.DataFrame, skills:pd.DataFrame) -> pd.DataFrame: + ''' + Adds a new column per skill indicating wether the user has completed said skill. + ''' + import re + import ast + + users_extended = users + for i, skill in skills.iterrows(): + users_extended.loc[:, skill["title"]] = 0 + + for i, user in users_extended.iterrows(): + clean_str = re.sub(r"ObjectId\('(.+?)'\)", r"'\1'", user["skillscompleted"]) + for completed in ast.literal_eval(clean_str): + skill = skills[skills["_id"] == completed] + users_extended.at[i, skill["title"]] = users_extended.loc[i, skill["title"]] + 1 + + return users_extended + +def distance(user1: pd.Series, user2: pd.Series, skills: pd.DataFrame) -> float: + ''' + Calculates the completion distance between two users. + + Note that the completion rates must have been added. 
+ ''' + user1_vect = user1[[title for title in skills["title"].unique()]] + user2_vect = user2[[title for title in skills["title"].unique()]] + return np.sqrt(np.array(user1_vect)@np.array(user2_vect)) + +def epsilon_neighborhood(user:pd.DataFrame, users:pd.DataFrame, skills:pd.DataFrame, epsilon:float) -> pd.DataFrame: + ''' + Returns a DataFrame containing all the users that are at a certain distance away from a given user. + + Note that the completion rates must have been added. + ''' + series = [] + for i, user2 in users.iterrows(): + if user2["_id"] != user["_id"]: + if distance(user, user2, skills) <= epsilon: + series.append(user2) + + return pd.DataFrame(series) +