diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml.disabled similarity index 100% rename from .github/workflows/docker-publish.yml rename to .github/workflows/docker-publish.yml.disabled diff --git a/.github/workflows/inclusive-tests-preps-toys-main.yml b/.github/workflows/inclusive-tests-preps-toys-main.yml new file mode 100644 index 0000000..ee25157 --- /dev/null +++ b/.github/workflows/inclusive-tests-preps-toys-main.yml @@ -0,0 +1,47 @@ +name: inclusive-tests-preps-toys-main + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test-prep-configs: + runs-on: ubuntu-latest + + strategy: + matrix: + include: + - config: '+id=000 args.data=./data/raw/mams/test.xml args.output=./output/mams-agg' + - config: '+id=001 args.data=./data/raw/twitter/acl-14-short-data/toy.raw args.output=./output/twitter-agg' + - config: '+id=002 args.data=./data/raw/semeval/toy.2016SB5/ABSA16_Restaurants_Train_SB1_v2.xml args.output=./output/semeval-agg' + + fail-fast: false + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Install conda + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: '3.8' + environment-file: environment.yml + activate-environment: lady-env + + - name: Install pip requirements + run: | + source $CONDA/bin/activate lady-env + pip install -r requirements.txt + + - name: Run prep with Hydra config + run: | + source $CONDA/bin/activate lady-env + python main.py cmd=prep ${{ matrix.config }} + working-directory: ./src diff --git a/requirements.txt b/requirements.txt index d240411..8a752ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pandas==1.4.4 seaborn spacy==3.7.2 typing_extensions==4.4.0 -tqdm==4.64.1 +tqdm==4.66.1 natsort scikit-learn>=1.0.0 nltk==3.7 @@ -11,7 +11,7 @@ nltk==3.7 # python -m spacy download 
en_core_web_trf # python -m spacy download en_core_web_sm # en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl -transformers==4.26.1 +transformers>=4.41.0 torch==1.13.1 # backtranslation-related modules @@ -42,4 +42,4 @@ more-itertools==10.1.0 # needed for implicit dataset generation openai pytest -python-dotenv \ No newline at end of file +python-dotenv diff --git a/src/cmn/mams.py b/src/cmn/mams.py index aeda095..f2461e9 100644 --- a/src/cmn/mams.py +++ b/src/cmn/mams.py @@ -8,7 +8,7 @@ class MAMSReview(Review): def __init__(self, id, sentences, time, author, aos): super().__init__(self, id, sentences, time, author, aos) @staticmethod - def load(path, explicit=False, implicit=True): return MAMSReview._xmlloader(path, explicit, implicit) + def load(path, explicit=True, implicit=False): return MAMSReview._xmlloader(path, explicit, implicit) @staticmethod def _xmlloader(path, explicit, implicit): diff --git a/src/config.yaml b/src/config.yaml index 4ab0efc..8291353 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -11,4 +11,17 @@ prep: nllb: facebook/nllb-200-distilled-600M max_l: 1500 device: ${oc.env:CUDA_VISIBLE_DEVICES, "cpu"} # Use CUDA if available, esle CPU - batch: true \ No newline at end of file + batch: true + +args: # required for 'prep' step + data: + #./data/raw/twitter/acl-14-short-data/toy.raw + ./data/raw/mams/test.xml + #./data/raw/semeval/toy.2016SB5/ABSA16_Restaurants_Train_SB1_v2.xml + + output: + #./output/twitter-agg + ./output/mams-agg + #./output/semeval-agg + + diff --git a/src/main.py b/src/main.py index 152d56d..0915c3d 100644 --- a/src/main.py +++ b/src/main.py @@ -49,7 +49,7 @@ def load(input, output, cfg, cache=True): # from cmn.mams import MAMSReview print("No specific dataset ('semeval' or 'twitter' or 'mams') was detected in the input.") print(f'(#reviews: {len(reviews)})') - print(f'\n1.2. 
Augmentation via backtranslation by {params.settings["prep"]["langaug"]} {"in batches" if params.settings["prep"] else ""}...') + print(f'\n1.2. Augmentation via backtranslation by {cfg.prep.languag} {"in batches" if params.settings["prep"] else ""}...') for lang in cfg.prep.languag: if lang: print(f'\n{lang} ...') @@ -239,69 +239,70 @@ def agg(path, output): def parse_args(): parser = argparse.ArgumentParser(description='Latent Aspect Detection') parser.add_argument('-am', type=str.lower, default='rnd', help='aspect modeling method (eg. --am lda)') - parser.add_argument('-data', dest='data', type=str, default='/Users/karanveersinghsidhu/StudyD/UOW Classes/Summer 2025 Drop Semester/Fani-Lab/LADy/data/raw/mams/TOCheck.xml', help='raw dataset file path, e.g., -data ..data/raw/semeval/2016SB5/ABSA16_Restaurants_Train_SB1_v2.xml') - parser.add_argument('-output', dest='output', type=str, default='/Users/karanveersinghsidhu/StudyD/UOW Classes/Summer 2025 Drop Semester/Fani-Lab/LADy/output/mams-agg', help='output path, e.g., -output ../output/semeval/2016.xml') + parser.add_argument('-data', dest='data', type=str, default='./data/raw/mams/test.xml', help='raw dataset file path, e.g., -data ..data/raw/semeval/2016SB5/ABSA16_Restaurants_Train_SB1_v2.xml') + parser.add_argument('-output', dest='output', type=str, default='./output/mams-agg', help='output path, e.g., -output ../output/semeval/2016.xml') parser.add_argument('-naspects', dest='naspects', type=int, default=25, help='user-defined number of aspects, e.g., -naspect 25') return parser.parse_args() -@hydra.main(config_path=".", config_name="config") +@hydra.main(version_base=None, config_path=".", config_name="config") def main(cfg: DictConfig): args = parse_args() if 'prep' in cfg.cmd: - if not os.path.isdir(args.output): os.makedirs(args.output) + if not os.path.isdir(cfg.args.output): os.makedirs(cfg.args.output) langaug_str = '.'.join([l for l in cfg.prep.languag if l]) - reviews = load(args.data, 
f'{args.output}/reviews.{langaug_str}.pkl'.replace('..pkl', '.pkl'), cfg) - splits = split(len(reviews), args.output) - output = f'{args.output}/{args.naspects}.{langaug_str}'.rstrip('.') - - am = None - - if not os.path.isdir(output): os.makedirs(output) - if 'rnd' == args.am: from aml.rnd import Rnd; am = Rnd(args.naspects, params.settings['train']['nwords']) - if 'lda' == args.am: from aml.lda import Lda; am = Lda(args.naspects, params.settings['train']['nwords']) - if 'btm' == args.am: from aml.btm import Btm; am = Btm(args.naspects, params.settings['train']['nwords']) - if 'ctm' == args.am: from aml.ctm import Ctm; am = Ctm(args.naspects, params.settings['train']['nwords'], params.settings['train']['ctm']['contextual_size'], params.settings['train']['ctm']['num_samples']) - if 'bert' == args.am: from aml.bert import BERT; am = BERT(args.naspects, params.settings['train']['nwords']) - if 'fast' == args.am: from aml.fast import Fast; am = Fast(args.naspects, params.settings['train']['nwords']) - if 'octis.ctm' == args.am: from octis.models.CTM import CTM; from aml.nrl import Nrl; am = Nrl(CTM(), args.naspects, params.settings['train']['nwords'], params.settings['train']['quality']) - if 'octis.neurallda' == args.am: from octis.models.NeuralLDA import NeuralLDA; from aml.nrl import Nrl; am = Nrl(NeuralLDA(), args.naspects, params.settings['train']['nwords'], params.settings['train']['quality']) - - - if(am is None): raise Exception('Model not found!') - - output = f'{output}/{am.name()}/' - - eval_for: ModelCapabilities = set(params.settings['eval']['for']).intersection(am.capabilities) #type: ignore - train_for: ModelCapabilities = set(params.settings['train']['for']).intersection(am.capabilities) #type: ignore - - if 'train' in params.settings['cmd']: - for capability in train_for: - for f in splits['folds'].keys(): - t_s = time.time() - reviews_train = np.array(reviews)[splits['folds'][f]['train']].tolist() - reviews_train.extend([r_.augs[lang][1] for r_ in 
reviews_train for lang in params.settings['prep']['langaug'] if lang and r_.augs[lang][2] >= params.settings['train']['langaug_semsim']]) - train(args, am, reviews_train, np.array(reviews)[splits['folds'][f]['valid']].tolist(), f, output, capability) - print(f'Trained time elapsed including language augs {params.settings["prep"]["langaug"]}: {time.time() - t_s}') - - # testing - if 'test' in params.settings['cmd']: - for capability in eval_for: - for f in splits['folds'].keys(): - test(am, np.array(reviews)[splits['test']].tolist(), f, output, capability) - - # evaluating - if 'eval' in params.settings['cmd']: - for capability in eval_for: - print(f'Evaluating for {am.name} on {capability}') - - cp_name = get_capability_short_name(capability) - - df_f_means = pd.DataFrame() - for f in splits['folds'].keys(): - input = f'{output}f{f}.model.{cp_name}.pred.{params.settings["test"]["h_ratio"]}' - df_mean = evaluate(input, f'{input}.{cp_name}.eval.mean.csv', capability) - df_f_means = pd.concat([df_f_means, df_mean], axis=1) - df_f_means.mean(axis=1).to_frame('mean').to_csv(f'{output}model.{cp_name}.pred.eval.mean.{params.settings["test"]["h_ratio"]}.csv') + reviews = load(cfg.args.data, f'{cfg.args.output}/reviews.{langaug_str}.pkl'.replace('..pkl', '.pkl'), cfg) + splits = split(len(reviews), cfg.args.output) + output = f'{cfg.args.output}/{args.naspects}.{langaug_str}'.rstrip('.') + + if any(x in cfg.cmd for x in ['train', 'test', 'eval']): + am = None + + if not os.path.isdir(output): os.makedirs(output) + if 'rnd' == args.am: from aml.rnd import Rnd; am = Rnd(args.naspects, params.settings['train']['nwords']) + if 'lda' == args.am: from aml.lda import Lda; am = Lda(args.naspects, params.settings['train']['nwords']) + if 'btm' == args.am: from aml.btm import Btm; am = Btm(args.naspects, params.settings['train']['nwords']) + if 'ctm' == args.am: from aml.ctm import Ctm; am = Ctm(args.naspects, params.settings['train']['nwords'], 
params.settings['train']['ctm']['contextual_size'], params.settings['train']['ctm']['num_samples']) + if 'bert' == args.am: from aml.bert import BERT; am = BERT(args.naspects, params.settings['train']['nwords']) + if 'fast' == args.am: from aml.fast import Fast; am = Fast(args.naspects, params.settings['train']['nwords']) + if 'octis.ctm' == args.am: from octis.models.CTM import CTM; from aml.nrl import Nrl; am = Nrl(CTM(), args.naspects, params.settings['train']['nwords'], params.settings['train']['quality']) + if 'octis.neurallda' == args.am: from octis.models.NeuralLDA import NeuralLDA; from aml.nrl import Nrl; am = Nrl(NeuralLDA(), args.naspects, params.settings['train']['nwords'], params.settings['train']['quality']) + + + if(am is None): raise Exception('Model not found!') + + output = f'{output}/{am.name()}/' + + eval_for: ModelCapabilities = set(params.settings['eval']['for']).intersection(am.capabilities) #type: ignore + train_for: ModelCapabilities = set(params.settings['train']['for']).intersection(am.capabilities) #type: ignore + + if 'train' in cfg.cmd: + for capability in train_for: + for f in splits['folds'].keys(): + t_s = time.time() + reviews_train = np.array(reviews)[splits['folds'][f]['train']].tolist() + reviews_train.extend([r_.augs[lang][1] for r_ in reviews_train for lang in params.settings['prep']['langaug'] if lang and r_.augs[lang][2] >= params.settings['train']['langaug_semsim']]) + train(args, am, reviews_train, np.array(reviews)[splits['folds'][f]['valid']].tolist(), f, output, capability) + print(f'Trained time elapsed including language augs {cfg.prep.languag}: {time.time() - t_s}') + + # testing + if 'test' in cfg.cmd: + for capability in eval_for: + for f in splits['folds'].keys(): + test(am, np.array(reviews)[splits['test']].tolist(), f, output, capability) + + # evaluating + if 'eval' in cfg.cmd: + for capability in eval_for: + print(f'Evaluating for {am.name} on {capability}') + + cp_name = get_capability_short_name(capability) 
+ + df_f_means = pd.DataFrame() + for f in splits['folds'].keys(): + input = f'{output}f{f}.model.{cp_name}.pred.{params.settings["test"]["h_ratio"]}' + df_mean = evaluate(input, f'{input}.{cp_name}.eval.mean.csv', capability) + df_f_means = pd.concat([df_f_means, df_mean], axis=1) + df_f_means.mean(axis=1).to_frame('mean').to_csv(f'{output}model.{cp_name}.pred.eval.mean.{params.settings["test"]["h_ratio"]}.csv') # {CUDA_VISIBLE_DEVICES=0,1} won't work https://discuss.pytorch.org/t/using-torch-data-prallel-invalid-device-string/166233 # TOKENIZERS_PARALLELISM=true @@ -313,4 +314,4 @@ def main(cfg: DictConfig): if __name__ == '__main__': main() - #if 'agg' in params.settings['cmd']: agg(args.output, args.output) \ No newline at end of file + #if 'agg' in params.settings['cmd']: agg(cfg.args.output, cfg.args.output) \ No newline at end of file