
Extractive #52


Open · wants to merge 3 commits into base: master
10 changes: 5 additions & 5 deletions config.py
@@ -14,11 +14,11 @@
     'autoencoder':None,
     'ae_batchsize': 5000,

-    'density_parameter': .04,
-    'minimum_samples': 4,
-    'min_clusters': 5,
-    'max_acceptable_clusters':30,
-    'min_num_candidates': 100,
+    'density_parameter': 2,
+    'minimum_samples': 2,
+    'min_clusters': 50,
+    'max_acceptable_clusters':100,
+    'min_num_candidates': 200,

     'BERT_finetune_path':'bert_finetune/models/finetune_electronics_mae1.pt',
     'BERT_config_path': 'bert_finetune/models/finetune_electronics_mae1config.json',
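These five values are the knobs for the DBSCAN clustering done in extractive/helpers.py. A minimal sketch of how they plausibly feed sklearn's DBSCAN, assuming find_clusters maps density_parameter to eps and minimum_samples to min_samples (helpers.py does import DBSCAN, but the exact key-to-argument mapping is an assumption). The jump of density_parameter from .04 to 2 is consistent with the switch from cosine to euclidean distances in helpers.py below:

```python
# Sketch only: assumes find_clusters wires these config keys into
# sklearn's DBSCAN as eps / min_samples.
import numpy as np
from sklearn.cluster import DBSCAN

config = {'density_parameter': 2, 'minimum_samples': 2}

embeddings = np.random.rand(200, 768)   # stand-in for sentence embeddings
labels = DBSCAN(eps=config['density_parameter'],
                min_samples=config['minimum_samples']).fit_predict(embeddings)
# DBSCAN labels noise points -1; exclude them when counting clusters.
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(num_clusters)
```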
2 changes: 1 addition & 1 deletion extractive/config.py
@@ -2,5 +2,5 @@
   "minimum_samples": 2,
   "min_clusters": 50,
   "min_chars_per_sentence": 42,
-  "min_num_candidates": 500,
+  "min_num_candidates": 200,
   "max_acceptable_clusters": 100 }
64 changes: 51 additions & 13 deletions extractive/extractive.py
@@ -2,6 +2,7 @@
 import pickle
 from config import config
 from helpers import find_clusters, sample, abstractive_clustering, cut_out_shorts
+import json

 def extractive_encode(list_of_sentences, savePath = None):
     """Featurizes a list of sentences using the indico API"""
@@ -40,38 +40,75 @@ def abstractive_cluster(features):
 if __name__ == "__main__":
     import numpy as np
     #using test data of most popular review from electronics
-    text = pickle.load(open("popular_electronics_sentences.p","rb"))
-    features = pickle.load(open("electronics_popular_features.p","rb"))
-    text, features = cut_out_shorts(text, features, config)
+    #text = pickle.load(open("popular_electronics_sentences.p","rb"))
+    #print("loading text")
+    #text = json.load(open("data.json","r"))
+    #print(np.shape(text["0"]))
+    #print("loading features")
+    data = pickle.load(open("sample_embeddings.p","rb"))
+    data = data["B002YU83YO"]
+    text = data["sentences"]
+    features = data["embeddings"]
+    features = np.asarray(features)
+    text, features = cut_out_shorts(text, features, config)
+    print(np.shape(text))
+    print(np.shape(features))
+    #means = abstractive_cluster(features)
+    #print(np.shape(means))

-    means = abstractive_cluster(features)
-    print(np.shape(means))

+    print("clustering...")
     sentence_labels, num_clusters = extractive_cluster(features)
     sentences = extractive_decode(text, sentence_labels, features, num_clusters, config)
     print("Number of clusters: " + str(num_clusters))
     print("Number of candidates: " + str(len(sentences)))
-    print(sentences[::len(sentences)//num_clusters])

+    print(sentences[::len(sentences)//num_clusters])
+
 def cluster(encodings, sentences, config):
-    if False:
+    if True:
         sentence_labels, num_clusters = find_clusters(encodings, config)
         candidate_sentences = sample(sentences, sentence_labels, encodings,
                                      num_clusters, config)
         return candidate_sentences
     else:
-        sentence_labels, _ = find_clusters(encodings, config)
+        sentence_labels, num_clusters = find_clusters(encodings, config)
+        print("Number of clusters: " + str(num_clusters))
         means = []
         for cluster in set(sentence_labels):
+            print("CLUSTER " + str(cluster) + "\n")
             if cluster == -1:
-                continue
+                pass

             cluster_indices = np.where(sentence_labels == cluster)
+            for i in cluster_indices[0][:10]:
+                print(sentences[i])
+                print("\n")
             cluster_core_samples = encodings[cluster_indices]
             average = np.mean(cluster_core_samples, axis = 0)
             means.append(average)
+        print(len(means))
         return means

-print(np.shape(cluster(features, [], config)))
-print(type(cluster(features, [], config)))
-print(type(cluster(features, [], config)[0]))
+#print(cluster(features, text, config))
+cluster(features, text, config)
+#print(np.shape(cluster(features, text, config)))
+#print(type(cluster(features, [], config)))
+#print(type(cluster(features, [], config)[0]))

+"""
+B000QUUFRW
+B000JE7GPY
+B000WL6YY8
+B003ES5ZUU
+B002YU83YO
+B008NMCPTQ
+B003LSTD38
+B000WYVBR0
+B001GTT0VO
+B0043WJRRS
+B00902SFC4
+B00GTGETFG
+B00007EDZG
+B002TLTGM6
+B0088CJT4U
+"""
7 changes: 4 additions & 3 deletions extractive/helpers.py
@@ -1,7 +1,7 @@
 import numpy as np
 from math import ceil
 from sklearn.cluster import DBSCAN
-from sklearn.metrics.pairwise import cosine_distances
+from sklearn.metrics.pairwise import cosine_distances, euclidean_distances


 def cut_out_shorts(list_of_sentences, features, config):
@@ -69,11 +69,12 @@ def sample(list_of_sentences, sentence_labels, features, num_clusters, config):
         cluster_indices = np.where(sentence_labels == cluster)
         cluster_core_samples = features[cluster_indices]
         average = np.mean(cluster_core_samples, axis = 0)
-        distances_from_cluster = cosine_distances(features, average.reshape(1,-1))
+        distances_from_cluster = euclidean_distances(features, average.reshape(1,-1))
         sample_sentence_indices = np.argsort(distances_from_cluster.flatten())[:samples_per_cluster]
         for sentence_index in sample_sentence_indices:
             candidates.append(list_of_sentences[sentence_index])

+            print(list_of_sentences[sentence_index])
+        print("#######")
     return candidates
Expand Down