Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 17 additions & 16 deletions recommenders/models/tfidf/tfidf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,12 @@ def __clean_text(self, text, for_BERT=False, verbose=False):
clean = clean.replace("Â\xa0", "") # non-breaking space

# Remove all punctuation and special characters
clean = re.sub("([^\s\w]|_)+", "", clean) # noqa W695 invalid escape sequence '\s'
clean = re.sub(
r"([^\s\w]|_)+", "", clean
) # noqa W695 invalid escape sequence '\s'

# If you want to keep some punctuation, see below commented out example
# clean = re.sub('([^\s\w\-\_\(\)]|_)+','', clean)
# clean = re.sub(r'([^\s\w\-\_\(\)]|_)+','', clean)

# Skip further processing if the text will be used in BERT tokenization
if for_BERT is False:
Expand Down Expand Up @@ -239,13 +241,17 @@ def __create_full_recommendation_dictionary(self, df_clean):
# Similarity measure
cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)

# sorted_idx has the indices that would sort the array.
sorted_idx = np.argsort(cosine_sim, axis=1)

data = list(df_clean[self.id_col].values)
len_df_clean = len(df_clean)

results = {}
for idx, row in df_clean.iterrows():
similar_indices = cosine_sim[idx].argsort()[: -(len(df_clean) + 1) : -1]
similar_items = [
(cosine_sim[idx][i], df_clean[self.id_col][i]) for i in similar_indices
]
results[row[self.id_col]] = similar_items[1:]
for idx, row in zip(range(0, len_df_clean), data):
similar_indices = sorted_idx[idx][: -(len_df_clean + 1) : -1]
similar_items = [(cosine_sim[idx][i], data[i]) for i in similar_indices]
results[row] = similar_items[1:]

# Save to class
self.recommendations = results
Expand All @@ -264,17 +270,12 @@ def __organize_results_as_tabular(self, df_clean, k):
rec_item_id = list()

# For each item
for idx in range(0, len(self.recommendations)):
for _item_id in self.recommendations:
# Information about the item we are basing recommendations off of
rec_based_on = list(self.recommendations.keys())[idx]
tmp_item_id = str(
df_clean.loc[df_clean[self.id_col] == rec_based_on][self.id_col].values[
0
]
)
rec_based_on = tmp_item_id = _item_id

# Get all scores and IDs for items recommended for this current item
rec_array = self.recommendations[rec_based_on]
rec_array = self.recommendations.get(rec_based_on)
tmp_rec_score = list(map(lambda x: x[0], rec_array))
tmp_rec_id = list(map(lambda x: x[1], rec_array))

Expand Down