-
Notifications
You must be signed in to change notification settings - Fork 622
Description
I keep getting the assertion error, and I don't understand why. I train on the user-matrix, and all seems well. But the number of users still seems to be the number of items, which is wrong.
Here is the code:
`import pandas as pd, numpy as np, warnings
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import implicit # v0.6+ recommended
warnings.filterwarnings("ignore", category=UserWarning)
──────────────────────────────────────────────────────────────
CONFIG
──────────────────────────────────────────────────────────────
DATA_CSV = "../data/orders_data.csv"
USER_COL = "_id"
ITEM_COL = "lead"
TIME_COL = "order_datetime"
MIN_INTER = 1
TEST_DAYS = 7
FACTORS = 64
TOP_K = 10
──────────────────────────────────────────────────────────────
df = pd.read_csv(DATA_CSV)
df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors="coerce")
df = df[df[TIME_COL].notna()].sort_values(TIME_COL)
implicit rating
df["rating"] = 1.0
1 — filter users with too few interactions (whole window)
valid_users = df[USER_COL].value_counts().loc[lambda s: s >= MIN_INTER].index
df = df[df[USER_COL].isin(valid_users)]
2 — chronological split
cutoff = df[TIME_COL].max() - pd.Timedelta(days=TEST_DAYS)
train = df[df[TIME_COL] < cutoff].copy()
test = df[df[TIME_COL] >= cutoff].copy()
3 — build maps on the TRAIN slice only
user2idx = {u: i for i, u in enumerate(train[USER_COL].unique())}
item2idx = {i: j for j, i in enumerate(train[ITEM_COL].unique())}
train["u"] = train[USER_COL].map(user2idx)
train["i"] = train[ITEM_COL].map(item2idx)
4 — build user × item CSR with NO empty rows/cols
n_users, n_items = len(user2idx), len(item2idx)
train_mat = csr_matrix(
(train["rating"].astype(np.float32),
(train["u"], train["i"])),
shape=(n_users, n_items),
dtype=np.float32,
)
5 — fit ALS (item × user orientation)
model = AlternatingLeastSquares(factors=FACTORS, regularization=0.01)
model.fit(train_mat.T.tocsr())
6 — sanity-check: rows == model.user_factors
assert train_mat.shape[0] == model.user_factors.shape[0], "row mismatch!"
7 — quick recommender utility
7 — quick recommender utility
def recommend_users(model, mat, user_ids, K=10):
"""user_ids are integer rows in mat
."""
# mat_csr = mat.tocsr() # No need to convert to csr_matrix every time if already csr
recs = {}
for u in user_ids:
# The implicit library will handle the filtering internally
# by looking at the user's row in the provided user_items
matrix
items, _ = model.recommend(
userid=u,
user_items=mat, # <--- Pass the ENTIRE user-item matrix here
N=K,
filter_already_liked_items=True,
)
recs[u] = list(map(int, items))
return recs
8 — generate recs for all training users
all_user_ids = np.arange(train_mat.shape[0])
recs = recommend_users(model, train_mat, all_user_ids, K=TOP_K)
print(f"Got recommendations for {len(recs)} users "
f"(example user 0 → item indices {recs[0][:TOP_K]})")
`