.github/workflows/python-package.yml (2 changes: 1 addition & 1 deletion)

```diff
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-22.04, macos-latest]
+        os: [windows-latest, ubuntu-22.04, macos-14]
        python-version: ["3.9", "3.10", "3.11", "3.12"]
    env:
      LIMIT_NUMPY_VERSION: 2.0.0
```
.github/workflows/python-publish.yml (2 changes: 1 addition & 1 deletion)

```diff
@@ -22,7 +22,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-22.04, macos-latest]
+        os: [windows-latest, ubuntu-22.04, macos-14]
        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
```
cornac/models/hypar/README.md (new file, 19 additions)

```diff
@@ -0,0 +1,19 @@
+# HypAR changes
+We've had to make some changes to the HypAR model to ensure compatibility with numpy 2.x.
+The main change replaces the gensim Word2Vec logic with `sentence-transformers`,
+which provides high-quality embeddings and is compatible with numpy 2.x. We therefore no longer learn embeddings
+from the data, but use a pre-trained model instead.
+We've also updated the requirements files to accommodate the new dependencies.
+
+To validate the new implementation, we reran the original experiments on the Cellphone and Computer datasets.
+The table below shows the results before and after the changes. We observe that these changes slightly affect
+performance. If you want the original implementation, use an older version of Cornac (before v2.3.0).
+
+
+| Dataset   | Model Version | AUC        | MAP        | NDCG       |
+|-----------|---------------|------------|------------|------------|
+| Cellphone | Original      | **0.7533** | 0.0517     | 0.2054     |
+|           | Updated       | 0.7493     | **0.0597** | **0.2124** |
+| Computer  | Original      | **0.7278** | 0.0194     | **0.1473** |
+|           | Updated       | 0.7214     | **0.0201** | 0.1462     |
+
```
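To make the swap concrete, here is a minimal sketch of the pre-trained lookup that replaces corpus-trained Word2Vec, using the `all-MiniLM-L6-v2` checkpoint from the code; the terms are toy examples, not the actual HypAR pipeline:

```python
# Minimal sketch: pre-trained lookup instead of corpus-trained Word2Vec.
# Toy terms for illustration; HypAR encodes its extracted aspect/opinion terms.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
terms = ['battery', 'screen', 'durable']
vectors = model.encode(terms)
print(vectors.shape)  # (3, 384): this checkpoint produces 384-dim embeddings
```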

cornac/models/hypar/dgl_utils.py (8 changes: 4 additions & 4 deletions)

```diff
@@ -249,11 +249,11 @@ def _generate(self, g, eids, canonical_etype):
 
 
 def stem_fn(x):
-    from gensim.parsing import stem_text
-
+    from nltk.stem import PorterStemmer
+    stemmer = PorterStemmer()
     # Remove special characters and numbers. Multiple dashes, single quotes, and equal signs, and similar special chars.
-    return stem_text(re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x))
-
+    cleaned = re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x)
+    return stemmer.stem(cleaned.lower())
 
 def stem(sentiment):
     ao_preprocess_fn = stem_fn
```
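One behavioral nuance of this change: gensim's `stem_text` lowercased and Porter-stemmed every token in its input, whereas NLTK's `PorterStemmer.stem` treats its argument as a single token. Since `stem_fn` is applied to individual aspect/opinion terms here, the two should agree for the common single-word case. A quick illustrative check with toy words, assuming NLTK is installed:

```python
from nltk.stem import PorterStemmer

# Porter stemming after lowercasing, mirroring the new stem_fn behavior.
stemmer = PorterStemmer()
for word in ['Batteries', 'running', 'charging']:
    print(word, '->', stemmer.stem(word.lower()))
# Batteries -> batteri
# running -> run
# charging -> charg
```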
cornac/models/hypar/hypar.py (2 changes: 2 additions & 0 deletions)

```diff
@@ -946,6 +946,8 @@ def inference(self, node_review_graph, ui_graph, device, batch_size):
 
         # Node preference embedding
         if self.preference_module == 'lightgcn':
+            # Move ui_graph to the same device as the model to avoid device mismatch
+            ui_graph = ui_graph.to(device)
             u, i, _ = self.lightgcn(ui_graph)
             x = {'user': u, 'item': i}
         else:
```
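The fix follows the general DGL rule that a graph's tensors live on one device, so message passing between a CPU-resident graph and GPU-resident model weights raises a device-mismatch error. A minimal sketch of the pattern with a toy graph (not HypAR's actual user-item graph):

```python
import torch
import dgl

# Toy graph; DGL graphs are created on CPU by default.
g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 0])))

device = 'cuda' if torch.cuda.is_available() else 'cpu'
g = g.to(device)  # .to() returns the graph on the target device
assert g.device == torch.device(device)
```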
cornac/models/hypar/recom_hypar.py (87 changes: 22 additions & 65 deletions)

```diff
@@ -411,97 +411,52 @@ def _graph_wrapper(self, train_set, graph_type, *args):
 
     def _ao_embeddings(self, train_set):
         """
-        Learn aspect and opinion embeddings using word2vec.
+        Learn aspect and opinion embeddings using sentence-transformers.
         Parameters
         ----------
         train_set: dataset
             Dataset to use for learning embeddings.
         Returns
         -------
-        Aspect and opinion embeddings, and word2vec model.
+        Aspect and opinion embeddings, and the sentence-transformers model.
         """
         from .dgl_utils import generate_mappings, stem_fn
-        from gensim.models import Word2Vec
-        from gensim.parsing import remove_stopwords, preprocess_string, stem_text
-        from nltk.tokenize import word_tokenize
-        from tqdm import tqdm
         import numpy as np
+        from sentence_transformers import SentenceTransformer
 
         sentiment = train_set.sentiment
 
         # Define preprocess functions for text, aspects and opinions.
         preprocess_fn = stem_fn
 
-        # Process corpus, getting all sentences and words.
-        corpus = []
-        for review in tqdm(train_set.review_text.corpus, desc='Processing text', disable=not self.verbose):
-            for sentence in review.split('.'):
-                words = word_tokenize(sentence.replace(' n\'t ', 'n ').replace('/', ' '))
-                corpus.append(' '.join(preprocess_fn(word) for word in words))
-
-        # Process words to match with aos extraction methodology used in SEER.
+        # Prepare aspect and opinion terms
         a_old_new_map = {a: preprocess_fn(a) for a in sentiment.aspect_id_map}
         o_old_new_map = {o: preprocess_fn(o) for o in sentiment.opinion_id_map}
 
         # Generate mappings for aspect and opinion ids.
         _, _, _, _, _, _, a2a, o2o = generate_mappings(train_set.sentiment, 'a', get_ao_mappings=True)
 
-        # Define a progressbar for training word2vec as no information is displayed without.
-        class CallbackProgressBar:
-            def __init__(self, verbose):
-                self.verbose = verbose
-                self.progress = None
-
-            def on_train_begin(self, method):
-                if self.progress is None:
-                    self.progress = tqdm(desc='Training Word2Vec', total=method.epochs, disable=not self.verbose)
-
-            def on_train_end(self, method):
-                pass
-
-            def on_epoch_begin(self, method):
-                pass
-
-            def on_epoch_end(self, method):
-                self.progress.update(1)
-
-        # Split words on space and get all unique words
-        wc = [s.split(' ') for s in corpus]
-        all_words = set(s for se in wc for s in se)
-
-        # Assert all aspects and opinions in dataset are in corpus. If not, print missing words.
-        # New datasets may require more preprocessing.
-        assert all([a in all_words for a in a_old_new_map.values()]), [a for a in a_old_new_map.values() if
-                                                                       a not in all_words]
-        assert all([o in all_words for o in o_old_new_map.values()]), [o for o in o_old_new_map.values() if
-                                                                       o not in all_words]
-
-        # Train word2vec model using callbacks for progressbar.
-        l = CallbackProgressBar(self.verbose)
-        embedding_dim = 100
-        w2v_model = Word2Vec(wc, vector_size=embedding_dim, min_count=1, window=5, callbacks=[l], epochs=100)
-
-        # Keyvector model
-        kv = w2v_model.wv
+        # Load sentence-transformers model (use a small, fast model by default)
+        model = SentenceTransformer('all-MiniLM-L6-v2')
+        embedding_dim = model.get_sentence_embedding_dimension()
+
+        # Encode all unique aspect and opinion terms
+        aspect_terms = [a_old_new_map[a] for a in sentiment.aspect_id_map]
+        opinion_terms = [o_old_new_map[o] for o in sentiment.opinion_id_map]
+        aspect_vecs = model.encode(aspect_terms, show_progress_bar=self.verbose)
+        opinion_vecs = model.encode(opinion_terms, show_progress_bar=self.verbose)
 
         # Initialize embeddings
         a_embeddings = np.zeros((len(set(a2a.values())), embedding_dim))
         o_embeddings = np.zeros((len(set(o2o.values())), embedding_dim))
 
-        # Define function for assigning embeddings to correct aspect.
-        def get_info(old_new_pairs, mapping, embedding):
-            for old, new in old_new_pairs:
-                nid = mapping(old)
-                vector = np.array(kv.get_vector(new))
-                embedding[nid] = vector
-
-            return embedding
-
-        # Assign embeddings to correct aspect and opinion.
-        a_embeddings = get_info(a_old_new_map.items(), lambda x: a2a[sentiment.aspect_id_map[x]], a_embeddings)
-        o_embeddings = get_info(o_old_new_map.items(), lambda x: o2o[sentiment.opinion_id_map[x]], o_embeddings)
+        # Assign embeddings to correct aspect and opinion
+        for idx, a in enumerate(sentiment.aspect_id_map):
+            nid = a2a[sentiment.aspect_id_map[a]]
+            a_embeddings[nid] = aspect_vecs[idx]
+        for idx, o in enumerate(sentiment.opinion_id_map):
+            nid = o2o[sentiment.opinion_id_map[o]]
+            o_embeddings[nid] = opinion_vecs[idx]
 
-        return a_embeddings, o_embeddings, kv
+        return a_embeddings, o_embeddings, model
 
     def _normalize_embedding(self, embedding):
         """
@@ -550,8 +505,10 @@ def _learn_initial_ao_embeddings(self, train_set):
         return torch.tensor(a_embeddings), torch.tensor(o_embeddings)
 
     def fit(self, train_set: Dataset, val_set=None):
+        import os
         import torch
         from .lightgcn import construct_graph
+        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
         # Initialize self variables
         super().fit(train_set, val_set)
```
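Condensed, the new `_ao_embeddings` flow is encode-then-scatter: batch-encode each preprocessed term once, then write each vector into the row addressed by the id remapping. A standalone sketch with toy mappings (the real `a2a` remapping comes from `generate_mappings`):

```python
import numpy as np
from sentence_transformers import SentenceTransformer

# Toy stand-ins for sentiment.aspect_id_map and the a2a remapping.
aspect_id_map = {'battery life': 0, 'screen': 1}
a2a = {0: 0, 1: 1}

model = SentenceTransformer('all-MiniLM-L6-v2')
dim = model.get_sentence_embedding_dimension()

# Batch-encode every unique (preprocessed) aspect term.
vecs = model.encode(list(aspect_id_map))

# Scatter each vector into the row given by the remapped id.
a_embeddings = np.zeros((len(set(a2a.values())), dim))
for idx, a in enumerate(aspect_id_map):
    a_embeddings[a2a[aspect_id_map[a]]] = vecs[idx]
```

The `TOKENIZERS_PARALLELISM = 'false'` line added to `fit` disables thread parallelism in HuggingFace tokenizers, which otherwise warns (and can deadlock) when the process forks worker processes after the tokenizer has already run.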
cornac/models/hypar/requirements.txt (13 changes: 8 additions & 5 deletions)

```diff
@@ -1,9 +1,12 @@
 # Links for torch and dgl
 -f https://download.pytorch.org/whl/torch_stable.html
 -f https://data.dgl.ai/wheels/torch-2.3/repo.html
 
-pandas==1.4.*
-gensim==4.2.0
+pandas==2.2.3
+scikit-learn>=1.0.0
+nltk>=3.6
+sentence-transformers==2.2.2
-dgl==1.0.*
-torch==1.*
-filelock==3.8.2
+dgl==2.4.0
+torch==2.3.*
+filelock==3.8.2
+huggingface_hub>=0.10.0,<0.16.0
```
cornac/models/hypar/requirements_cu116.txt (10 deletions)

This file was deleted.
cornac/models/hypar/requirements_cu118.txt (new file, 14 additions)

```diff
@@ -0,0 +1,14 @@
+# Links for torch and dgl
+
+-f https://data.dgl.ai/wheels/torch-2.3/cu118/repo.html
+
+pandas==2.2.3
+scikit-learn>=1.0.0
+nltk>=3.6
+sentence-transformers==2.2.2
+dgl==2.4.0+cu118
+filelock==3.8.2
+huggingface_hub>=0.10.0,<0.16.0
+
+--index-url https://download.pytorch.org/whl/cu118
+torch==2.3.*
```
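As a suggested post-install sanity check (not part of the PR), the pinned stack can be verified from Python:

```python
# Suggested sanity check after installing either requirements file.
import dgl
import sentence_transformers
import torch

print('torch', torch.__version__)               # pinned to 2.3.*
print('dgl', dgl.__version__)                   # pinned to 2.4.0 (+cu118 for GPU)
print('st', sentence_transformers.__version__)  # pinned to 2.2.2
print('cuda available:', torch.cuda.is_available())
```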