Skip to content

Assertion error (matrix transpose, user-item, item-user matrix) #738

@levrone1987

Description

@levrone1987

I keep getting the assertion error, and I don't understand why. I train on the user-matrix, and all seems well. But the number of users still seems to be the number of items, which is wrong.

Here is the code:
`import pandas as pd, numpy as np, warnings
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import implicit # v0.6+ recommended
warnings.filterwarnings("ignore", category=UserWarning)

──────────────────────────────────────────────────────────────

CONFIG

──────────────────────────────────────────────────────────────

DATA_CSV = "../data/orders_data.csv"
USER_COL = "_id"
ITEM_COL = "lead"
TIME_COL = "order_datetime"
MIN_INTER = 1
TEST_DAYS = 7
FACTORS = 64
TOP_K = 10

──────────────────────────────────────────────────────────────

df = pd.read_csv(DATA_CSV)
df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors="coerce")
df = df[df[TIME_COL].notna()].sort_values(TIME_COL)

implicit rating

df["rating"] = 1.0

1 — filter users with too few interactions (whole window)

valid_users = df[USER_COL].value_counts().loc[lambda s: s >= MIN_INTER].index
df = df[df[USER_COL].isin(valid_users)]

2 — chronological split

cutoff = df[TIME_COL].max() - pd.Timedelta(days=TEST_DAYS)
train = df[df[TIME_COL] < cutoff].copy()
test = df[df[TIME_COL] >= cutoff].copy()

3 — build maps on the TRAIN slice only

user2idx = {u: i for i, u in enumerate(train[USER_COL].unique())}
item2idx = {i: j for j, i in enumerate(train[ITEM_COL].unique())}

train["u"] = train[USER_COL].map(user2idx)
train["i"] = train[ITEM_COL].map(item2idx)

4 — build user × item CSR with NO empty rows/cols

n_users, n_items = len(user2idx), len(item2idx)
train_mat = csr_matrix(
(train["rating"].astype(np.float32),
(train["u"], train["i"])),
shape=(n_users, n_items),
dtype=np.float32,
)

5 — fit ALS (item × user orientation)

model = AlternatingLeastSquares(factors=FACTORS, regularization=0.01)
model.fit(train_mat.T.tocsr())

6 — sanity-check: rows == model.user_factors

assert train_mat.shape[0] == model.user_factors.shape[0], "row mismatch!"

7 — quick recommender utility

7 — quick recommender utility

def recommend_users(model, mat, user_ids, K=10):
"""user_ids are integer rows in mat."""
# mat_csr = mat.tocsr() # No need to convert to csr_matrix every time if already csr
recs = {}
for u in user_ids:
# The implicit library will handle the filtering internally
# by looking at the user's row in the provided user_items matrix
items, _ = model.recommend(
userid=u,
user_items=mat, # <--- Pass the ENTIRE user-item matrix here
N=K,
filter_already_liked_items=True,
)
recs[u] = list(map(int, items))
return recs

8 — generate recs for all training users

all_user_ids = np.arange(train_mat.shape[0])
recs = recommend_users(model, train_mat, all_user_ids, K=TOP_K)

print(f"Got recommendations for {len(recs)} users "
f"(example user 0 → item indices {recs[0][:TOP_K]})")
`

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions