moest-np · xiring · Jun 3, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,27 @@
+# Ignore virtual environment
+venv/
+ENV/
+env/
+.venv/
+
+# Ignore Python cache files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Ignore Jupyter Notebook checkpoints
+.ipynb_checkpoints
+
+# Ignore specific data files
+data/*.tsv
+data/*.csv
+
+# Ignore the output CSV file
+school_mapping_results.csv
+
+# Ignore system files
+.DS_Store
+Thumbs.db
+
+# Ignore VSCode settings
+.vscode/
diff --git a/README.md b/README.md
@@ -1,4 +1,70 @@
 # Incubator
-Problem statements, discussions and prototypes
 
-This repo will host problem statements and inception discussions. Baselined requirements and context will be in project specific folder and accompanying discussion will be in discussion section.
+Problem statements, discussions, and prototypes
+
+This repo will host problem statements and inception discussions. Baselined requirements and context will be in the project-specific folder, and accompanying discussions will be in the discussion section.
+
+## Setup Instructions
+
+1. **Clone the repository** (if applicable) or navigate to the project directory.
+
+2. **Create a virtual environment**:
+
+    ```sh
+    python3 -m venv venv
+    ```
+
+3. **Activate the virtual environment**:
+
+    - On Windows:
+
+      ```sh
+      venv\Scripts\activate
+      ```
+
+    - On macOS/Linux:
+
+      ```sh
+      source venv/bin/activate
+      ```
+
+4. **Install the required packages**:
+
+    ```sh
+    pip install -r requirements.txt
+    ```
+
+## Running the Script
+
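+1. **Ensure your data files (`school_list_A.tsv`, `school_list_B.tsv`, `jilla.tsv`) are placed in the `2024-05_school_mapping/data/` directory**, which is where the script reads them from.
+
+2. **Run the script** from the directory that contains `2024-05_school_mapping/` (the data paths are relative to the working directory):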
+
+    ```sh
+    python school_mapping.py
+    ```
+
+3. **Output**: The script will generate a `school_mapping_results.csv` file containing the matched schools along with their district information.
+
+## Matching Logic
+
+- The script transliterates Source A school names from Devanagari to Roman script (Velthuis scheme) and matches them against the English school names, including former names, in Source B.
+- District information is used to filter potential matches to ensure they are from the same district.
+- Fuzzy matching is applied to find the best match based on the transliterated school names.
+- Matches with a score at or above the threshold (default: 70) are included in the final output.
+
+## Output Fields
+
+- `school_id_a`: School ID from Source A
+- `school_id_b`: School ID from Source B
+- `match_score`: Fuzzy match score
+- `school_name_a`: School name from Source A
+- `school_name_b`: School name from Source B
+- `district_id_a`: District ID from Source A
+- `district_a`: District name from Source A
+- `district_b`: District name from Source B
+
+## Dependencies
+- pandas
+- rapidfuzz
+- indic-transliteration
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+rapidfuzz
+translate
+indic-transliteration
diff --git a/school_mapping.py b/school_mapping.py
@@ -0,0 +1,87 @@
+import pandas as pd
+from rapidfuzz import fuzz, process
+from indic_transliteration import sanscript
+import re
+
+# Load the tab-separated input files (paths are relative to the current working directory)
+source_a = pd.read_csv('2024-05_school_mapping/data/school_list_A.tsv', sep='\t')
+source_b = pd.read_csv('2024-05_school_mapping/data/school_list_B.tsv', sep='\t')
+jilla = pd.read_csv('2024-05_school_mapping/data/jilla.tsv', sep='\t')
+
+def transliterate_text(text):
+    """Return `text` transliterated from Devanagari to Velthuis; '' for non-strings such as NaN from blank cells."""
+    return sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.VELTHUIS) if isinstance(text, str) else ''
+
+# Normalize the text columns used for matching: lower-case and trim surrounding whitespace
+source_a['velthuis'] = source_a['school'].apply(transliterate_text).str.lower().str.strip()
+source_a['district1'] = source_a['district1'].str.lower().str.strip()
+source_b['district'] = source_b['district'].str.lower().str.strip()
+
+# Source B district name -> district id (one entry per distinct normalized name)
+district_mapping_b = source_b[['district', 'district_id']].drop_duplicates().set_index('district')['district_id'].to_dict()
+
+# Source A district name -> district id; keys are normalized like 'district1' so lookups cannot miss on case/whitespace
+district_mapping_a = dict(zip(jilla['जिल्ला'].str.lower().str.strip(), jilla['district_id']))
+
+# Return `text` with regex metacharacters escaped (thin wrapper over re.escape; not called in the code shown here)
+def escape_regex(text):
+    return re.escape(text)
+
+def match_schools(source_a, source_b, district_mapping_a, district_mapping_b, threshold=70):
+    """Fuzzy-match every Source A school to one Source B school of the same district.
+
+    `source_a` needs 'school_id', 'school', 'velthuis' and 'district1' columns; `source_b`
+    needs 'school_id', 'name', 'old_name1'..'old_name3', 'district' and 'district_id'.
+    `district_mapping_a` maps a Source A district name to a district id;
+    `district_mapping_b` is accepted for backward compatibility and is not used.
+    Returns a DataFrame with one row per Source A school whose best candidate
+    scores at least `threshold` under `fuzz.token_sort_ratio`.
+    """
+    columns = ['school_id_a', 'school_id_b', 'match_score', 'school_name_a',
+               'school_name_b', 'district_id_a', 'district_a', 'district_b']
+    old_name_columns = ['old_name1', 'old_name2', 'old_name3']
+    candidates_by_district = {}  # district_id -> (Source B rows, names, owning row per name)
+    matches = []
+
+    for _, row in source_a.iterrows():
+        district_id_a = district_mapping_a.get(row['district1'])
+        if district_id_a is None:
+            continue  # district missing from the mapping: nothing to compare against
+
+        if district_id_a not in candidates_by_district:
+            # Build each district's candidate list once, not once per Source A school.
+            rows_b = source_b[source_b['district_id'] == district_id_a].reset_index(drop=True)
+            current = rows_b['name'].dropna()
+            old = rows_b[old_name_columns].stack().dropna()
+            # Current names precede old names, the same candidate order as before.
+            owners = current.index.tolist() + old.index.get_level_values(0).tolist()
+            # The query is lower-cased, so compare against lower-cased candidates.
+            names = [str(name).lower() for name in current.tolist() + old.tolist()]
+            candidates_by_district[district_id_a] = (rows_b, names, owners)
+        rows_b, names, owners = candidates_by_district[district_id_a]
+
+        best = process.extractOne(row['velthuis'], names, scorer=fuzz.token_sort_ratio)
+        if best is None or best[1] < threshold:
+            continue
+        # best is (name, score, position); the position identifies the owning row directly.
+        row_b = rows_b.loc[owners[best[2]]]
+        matches.append({
+            'school_id_a': row['school_id'],
+            'school_id_b': row_b['school_id'],
+            'match_score': best[1],
+            'school_name_a': row['school'],
+            'school_name_b': row_b['name'],
+            'district_id_a': district_id_a,
+            'district_a': row['district1'],
+            'district_b': row_b['district'],
+        })
+    # Passing the columns keeps the CSV header even when nothing matched.
+    return pd.DataFrame(matches, columns=columns)
+
+# Match Source A schools against Source B using match_schools' default threshold
+matched_schools = match_schools(source_a, source_b, district_mapping_a, district_mapping_b)
+
+# Write the matches to the current working directory, omitting the DataFrame index column
+matched_schools.to_csv('school_mapping_results.csv', index=False)
+
+print(f"Total matches found: {len(matched_schools)}")