diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 2357894aa..a8ab5c434 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-22.04, macos-latest]
+        os: [windows-latest, ubuntu-22.04, macos-14]
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     env:
       LIMIT_NUMPY_VERSION: 2.0.0
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 9fded091b..270ad2b77 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -22,7 +22,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-22.04, macos-latest]
+        os: [windows-latest, ubuntu-22.04, macos-14]
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v4
diff --git a/cornac/models/hypar/README.md b/cornac/models/hypar/README.md
new file mode 100644
index 000000000..bd6c74d33
--- /dev/null
+++ b/cornac/models/hypar/README.md
@@ -0,0 +1,19 @@
+# HypAR changes
+We've had to make some changes to the HypAR model to ensure compatibility with numpy 2.x.
+The main change is replacing the gensim Word2Vec logic with `sentence-transformers`, which provides
+high-quality embeddings and is compatible with numpy 2.x. We therefore no longer learn embeddings
+from the data, but use a pre-trained model instead.
+Furthermore, we've updated the requirements files to accommodate the new versions.
+
+To validate the new implementation, we ran the original experiments on the Cellphone and Computer datasets.
+The table below shows the results before and after the changes. These changes slightly affect performance.
+If you want to use the original implementation, use an older version of Cornac (before v2.3.0).
+
+
+| Dataset   | Model Version | AUC        | MAP        | NDCG       |
+|-----------|---------------|------------|------------|------------|
+| Cellphone | Original      | **0.7533** | 0.0517     | 0.2054     |
+|           | Updated       | 0.7493     | **0.0597** | **0.2124** |
+| Computer  | Original      | **0.7278** | 0.0194     | **0.1473** |
+|           | Updated       | 0.7214     | **0.0201** | 0.1462     |
+
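The README above describes swapping learned Word2Vec vectors for a pre-trained encoder. A minimal sketch of that approach, assuming only that `sentence-transformers` is installed (the model name `all-MiniLM-L6-v2` is the one used in the updated code; the terms are hypothetical):

```python
# Encode terms with a pre-trained model instead of training Word2Vec on the corpus.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # produces 384-dimensional embeddings
terms = ['battery life', 'screen', 'great', 'slow']  # hypothetical aspect/opinion terms
vectors = model.encode(terms, show_progress_bar=False)
print(vectors.shape)  # (4, 384)
```

Because the encoder is pre-trained, any string can be embedded, which is why the corpus vocabulary assertions in `_ao_embeddings` (removed below) are no longer needed.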
diff --git a/cornac/models/hypar/dgl_utils.py b/cornac/models/hypar/dgl_utils.py
index 2d02e790a..0ca750fc5 100644
--- a/cornac/models/hypar/dgl_utils.py
+++ b/cornac/models/hypar/dgl_utils.py
@@ -249,11 +249,12 @@ def _generate(self, g, eids, canonical_etype):
 
 
 def stem_fn(x):
-    from gensim.parsing import stem_text
-
+    from nltk.stem import PorterStemmer
+    stemmer = PorterStemmer()
     # Remove special characters and numbers. Multiple dashes, single quotes, and equal signs, and similar special chars.
-    return stem_text(re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x))
-
+    cleaned = re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x)
+    # Stem word by word: PorterStemmer expects single tokens, and this mirrors gensim's stem_text.
+    return ' '.join(stemmer.stem(w) for w in cleaned.lower().split())
 
 def stem(sentiment):
     ao_preprocess_fn = stem_fn
diff --git a/cornac/models/hypar/hypar.py b/cornac/models/hypar/hypar.py
index cc38dbe8f..699cc9278 100644
--- a/cornac/models/hypar/hypar.py
+++ b/cornac/models/hypar/hypar.py
@@ -946,6 +946,8 @@ def inference(self, node_review_graph, ui_graph, device, batch_size):
 
         # Node preference embedding
         if self.preference_module == 'lightgcn':
+            # Move ui_graph to the same device as the model to avoid device mismatch
+            ui_graph = ui_graph.to(device)
             u, i, _ = self.lightgcn(ui_graph)
             x = {'user': u, 'item': i}
         else:
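The `inference` fix above follows the standard DGL convention: a graph lives on a device just like a tensor, and passing a CPU graph to a module on the GPU raises a device-mismatch error. A minimal sketch of the pattern, using a hypothetical toy graph rather than HypAR's actual `ui_graph`:

```python
import torch
import dgl
from dgl.nn import GraphConv

# Fall back to CPU when no GPU is present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hypothetical toy graph; self-loops avoid GraphConv's zero-in-degree error.
g = dgl.add_self_loop(dgl.graph(([0, 1, 2], [1, 2, 3]), num_nodes=4))
feat = torch.randn(4, 16)

conv = GraphConv(16, 8).to(device)

# Move both the graph and its input features to the module's device, as in the fix above.
g = g.to(device)
feat = feat.to(device)
out = conv(g, feat)
print(out.shape)  # torch.Size([4, 8])
```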
diff --git a/cornac/models/hypar/recom_hypar.py b/cornac/models/hypar/recom_hypar.py
index b8ffa0301..0f422b498 100644
--- a/cornac/models/hypar/recom_hypar.py
+++ b/cornac/models/hypar/recom_hypar.py
@@ -411,97 +411,52 @@ def _graph_wrapper(self, train_set, graph_type, *args):
 
     def _ao_embeddings(self, train_set):
         """
-        Learn aspect and opinion embeddings using word2vec.
+        Compute aspect and opinion embeddings using a pre-trained sentence-transformers model.
 
         Parameters
         ----------
         train_set: dataset
             Dataset to use for learning embeddings.
 
         Returns
         -------
-        Aspect and opinion embeddings, and word2vec model.
+        Aspect and opinion embeddings, and the sentence-transformers model.
         """
         from .dgl_utils import generate_mappings, stem_fn
-        from gensim.models import Word2Vec
-        from gensim.parsing import remove_stopwords, preprocess_string, stem_text
         from nltk.tokenize import word_tokenize
         from tqdm import tqdm
         import numpy as np
+        from sentence_transformers import SentenceTransformer
 
         sentiment = train_set.sentiment
-
-        # Define preprocess functions for text, aspects and opinions.
         preprocess_fn = stem_fn
 
-        # Process corpus, getting all sentences and words.
-        corpus = []
-        for review in tqdm(train_set.review_text.corpus, desc='Processing text', disable=not self.verbose):
-            for sentence in review.split('.'):
-                words = word_tokenize(sentence.replace(' n\'t ', 'n ').replace('/', ' '))
-                corpus.append(' '.join(preprocess_fn(word) for word in words))
-
-        # Process words to match with aos extraction methodology used in SEER.
+        # Prepare aspect and opinion terms
         a_old_new_map = {a: preprocess_fn(a) for a in sentiment.aspect_id_map}
         o_old_new_map = {o: preprocess_fn(o) for o in sentiment.opinion_id_map}
-
-        # Generate mappings for aspect and opinion ids.
         _, _, _, _, _, _, a2a, o2o = generate_mappings(train_set.sentiment, 'a', get_ao_mappings=True)
 
-        # Define a progressbar for training word2vec as no information is displayed without.
-        class CallbackProgressBar:
-            def __init__(self, verbose):
-                self.verbose = verbose
-                self.progress = None
-
-            def on_train_begin(self, method):
-                if self.progress is None:
-                    self.progress = tqdm(desc='Training Word2Vec', total=method.epochs, disable=not self.verbose)
-
-            def on_train_end(self, method):
-                pass
-
-            def on_epoch_begin(self, method):
-                pass
+        # Load sentence-transformers model (use a small, fast model by default)
+        model = SentenceTransformer('all-MiniLM-L6-v2')
+        embedding_dim = model.get_sentence_embedding_dimension()
 
-            def on_epoch_end(self, method):
-                self.progress.update(1)
-
-        # Split words on space and get all unique words
-        wc = [s.split(' ') for s in corpus]
-        all_words = set(s for se in wc for s in se)
-
-        # Assert all aspects and opinions in dataset are in corpus. If not, print missing words.
-        # New datasets may require more preprocessing.
-        assert all([a in all_words for a in a_old_new_map.values()]), [a for a in a_old_new_map.values() if
-                                                                       a not in all_words]
-        assert all([o in all_words for o in o_old_new_map.values()]), [o for o in o_old_new_map.values() if
-                                                                       o not in all_words]
-
-        # Train word2vec model using callbacks for progressbar.
-        l = CallbackProgressBar(self.verbose)
-        embedding_dim = 100
-        w2v_model = Word2Vec(wc, vector_size=embedding_dim, min_count=1, window=5, callbacks=[l], epochs=100)
-
-        # Keyvector model
-        kv = w2v_model.wv
+        # Encode all unique aspect and opinion terms
+        aspect_terms = [a_old_new_map[a] for a in sentiment.aspect_id_map]
+        opinion_terms = [o_old_new_map[o] for o in sentiment.opinion_id_map]
+        aspect_vecs = model.encode(aspect_terms, show_progress_bar=self.verbose)
+        opinion_vecs = model.encode(opinion_terms, show_progress_bar=self.verbose)
 
         # Initialize embeddings
         a_embeddings = np.zeros((len(set(a2a.values())), embedding_dim))
         o_embeddings = np.zeros((len(set(o2o.values())), embedding_dim))
 
-        # Define function for assigning embeddings to correct aspect.
-        def get_info(old_new_pairs, mapping, embedding):
-            for old, new in old_new_pairs:
-                nid = mapping(old)
-                vector = np.array(kv.get_vector(new))
-                embedding[nid] = vector
-
-            return embedding
-
-        # Assign embeddings to correct aspect and opinion.
-        a_embeddings = get_info(a_old_new_map.items(), lambda x: a2a[sentiment.aspect_id_map[x]], a_embeddings)
-        o_embeddings = get_info(o_old_new_map.items(), lambda x: o2o[sentiment.opinion_id_map[x]], o_embeddings)
+        # Assign embeddings to correct aspect and opinion
+        for idx, a in enumerate(sentiment.aspect_id_map):
+            nid = a2a[sentiment.aspect_id_map[a]]
+            a_embeddings[nid] = aspect_vecs[idx]
+        for idx, o in enumerate(sentiment.opinion_id_map):
+            nid = o2o[sentiment.opinion_id_map[o]]
+            o_embeddings[nid] = opinion_vecs[idx]
 
-        return a_embeddings, o_embeddings, kv
+        return a_embeddings, o_embeddings, model
 
     def _normalize_embedding(self, embedding):
         """
@@ -550,8 +505,10 @@ def _learn_initial_ao_embeddings(self, train_set):
 
         return torch.tensor(a_embeddings), torch.tensor(o_embeddings)
 
     def fit(self, train_set: Dataset, val_set=None):
+        import os
         import torch
         from .lightgcn import construct_graph
+        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
         # Initialize self variables
         super().fit(train_set, val_set)
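To make the rewritten `_ao_embeddings` flow concrete: each preprocessed term is encoded once, and the resulting vectors are scattered into id-indexed matrices. A standalone sketch, with hypothetical toy mappings standing in for Cornac's `sentiment.aspect_id_map` and `a2a` structures:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

# Hypothetical stand-ins for sentiment.aspect_id_map and the a2a mapping.
aspect_id_map = {'battery life': 0, 'screen': 1}  # term -> raw aspect id
a2a = {0: 0, 1: 1}                                # raw aspect id -> merged aspect id

model = SentenceTransformer('all-MiniLM-L6-v2')
dim = model.get_sentence_embedding_dimension()  # 384 for this model

terms = list(aspect_id_map)
vecs = model.encode(terms, show_progress_bar=False)

# Zero-initialize, then place each vector at its merged aspect id.
a_embeddings = np.zeros((len(set(a2a.values())), dim))
for idx, term in enumerate(aspect_id_map):
    a_embeddings[a2a[aspect_id_map[term]]] = vecs[idx]
```

Opinion embeddings follow the same pattern with `opinion_id_map` and `o2o`.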
diff --git a/cornac/models/hypar/requirements.txt b/cornac/models/hypar/requirements.txt
index 173bfa83b..8c457c7e5 100644
--- a/cornac/models/hypar/requirements.txt
+++ b/cornac/models/hypar/requirements.txt
@@ -1,9 +1,12 @@
 # Links for torch and dgl
 -f https://download.pytorch.org/whl/torch_stable.html
+-f https://data.dgl.ai/wheels/torch-2.3/repo.html
 
-pandas==1.4.*
-gensim==4.2.0
+pandas==2.2.3
+scikit-learn>=1.0.0
+nltk>=3.6
 sentence-transformers==2.2.2
-dgl==1.0.*
-torch==1.*
-filelock==3.8.2
\ No newline at end of file
+dgl==2.4.0
+torch==2.3.*
+filelock==3.8.2
+huggingface_hub>=0.10.0,<0.16.0
\ No newline at end of file
diff --git a/cornac/models/hypar/requirements_cu116.txt b/cornac/models/hypar/requirements_cu116.txt
deleted file mode 100644
index 6e4dad2b1..000000000
--- a/cornac/models/hypar/requirements_cu116.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# Links for torch and dgl
--f https://download.pytorch.org/whl/torch_stable.html
--f https://data.dgl.ai/wheels/cu116/repo.html
-
-pandas==1.4.*
-gensim==4.2.0
-sentence-transformers==2.2.2
-dgl==1.0.*
-torch==1.13.1+cu116
-filelock==3.8.2
\ No newline at end of file
diff --git a/cornac/models/hypar/requirements_cu118.txt b/cornac/models/hypar/requirements_cu118.txt
new file mode 100644
index 000000000..edbe72be0
--- /dev/null
+++ b/cornac/models/hypar/requirements_cu118.txt
@@ -0,0 +1,14 @@
+# Links for torch and dgl
+
+-f https://data.dgl.ai/wheels/torch-2.3/cu118/repo.html
+
+pandas==2.2.3
+scikit-learn>=1.0.0
+nltk>=3.6
+sentence-transformers==2.2.2
+dgl==2.4.0+cu118
+filelock==3.8.2
+huggingface_hub>=0.10.0,<0.16.0
+
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.3.*
\ No newline at end of file
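Finally, a small sketch of the replacement stemming behaviour in `dgl_utils.py` above. NLTK's `PorterStemmer` operates on single tokens, so the updated `stem_fn` cleans the string and then stems word by word, mirroring the removed gensim `stem_text`:

```python
import re
from nltk.stem import PorterStemmer

def stem_fn(x):
    stemmer = PorterStemmer()
    # Remove special characters and numbers, then lowercase and stem each word.
    cleaned = re.sub(r'--+.*|-+$|\+\+|\'.+|=+.*$|-\d.*', '', x)
    return ' '.join(stemmer.stem(w) for w in cleaned.lower().split())

print(stem_fn('battery life'))  # 'batteri life'
```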