diff --git a/.gitignore b/ACL2020/.gitignore similarity index 100% rename from .gitignore rename to ACL2020/.gitignore diff --git a/LICENSE b/ACL2020/LICENSE similarity index 100% rename from LICENSE rename to ACL2020/LICENSE diff --git a/ACL2020/README.md b/ACL2020/README.md new file mode 100644 index 0000000..80f324a --- /dev/null +++ b/ACL2020/README.md @@ -0,0 +1,57 @@ +# Toxicity detection w/ and w/o context +* Concerning comments existing in a thread. +* Context information: + * The parent comment. + * The discussion topic. +* The large dataset (CAT_LARGE) can be found in the `data` folder. + * `gn.csv` comprises the out-of-context annotations. + * `gc.csv` comprises the in-context annotations. +* The small dataset (CAT_SMALL) is also included. + +### Word embeddings +* You will need to add a folder `embeddings` when using pre-trained embeddings. + * For example, GloVe embeddings. + +### Building the datasets +Create random splits: +>python experiments.py --create_random_splits 10 + +Downsample the two categories (one per dataset) to make the datasets balanced and equally sized: +>python experiments.py --create_balanced_datasets + +Then, create 10 random splits: +>python experiments.py --create_random_splits 10 --use_balanced_datasets True + +### Running a classifier + +Run a simple bi-LSTM by: +> nohup python experiments.py --with_context_data False --with_context_model "RNN:OOC" --repeat 10 > rnn.ooc.log & + +* You can also train it on in-context (IC) data, by changing the related argument. + * If you call "RNN:INC1", the same LSTM will be trained, but another LSTM will encode the parent text (IC data required) and concatenate the two encoded texts before the dense layers on the top. + * If you call "BERT:OOC" you have a simple BERT. + * If you call "BERT:INC1" you concatenate the parent text (IC data required) with a separator token. + * If you call "BERT:CA" you extend BERT:OOC with the LSTM encoded parent text, similarly to the RNN:INC1. 
+ +### The article +* Presented at the 58th Annual Meeting of the Association for Computational Linguistics ([link](https://arxiv.org/abs/2006.00998)). + +### How to cite this work: +``` +@inproceedings{pavlopoulos-etal-2020-toxicity, + title = "Toxicity Detection: Does Context Really Matter?", + author = "Pavlopoulos, John and + Sorensen, Jeffrey and + Dixon, Lucas and + Thain, Nithum and + Androutsopoulos, Ion", + booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.acl-main.396", + doi = "10.18653/v1/2020.acl-main.396", + pages = "4296--4305", +} +``` \ No newline at end of file diff --git a/classifiers.py b/ACL2020/classifiers.py similarity index 99% rename from classifiers.py rename to ACL2020/classifiers.py index 8137efe..dd1a8de 100644 --- a/classifiers.py +++ b/ACL2020/classifiers.py @@ -2,7 +2,7 @@ from tensorflow.keras.callbacks import EarlyStopping from tensorflow.keras.layers import Dense from tensorflow.keras.layers import Embedding -from tensorflow.keras.layers import GRU, LSTM, Bidirectional, TimeDistributed +from tensorflow.keras.layers import LSTM, Bidirectional from tensorflow.keras.layers import Input from tensorflow.keras.models import Model from tensorflow.keras.preprocessing import sequence @@ -15,7 +15,7 @@ import tensorflow_hub as hub from tensorflow.keras import backend as K from bert import tokenization -from utils import InputExample, convert_examples_to_features +from ACL2020.utils import InputExample, convert_examples_to_features from sklearn.metrics import * import pickle diff --git a/data/CAT_LARGE/gc.csv b/ACL2020/data/CAT_LARGE/gc.csv similarity index 100% rename from data/CAT_LARGE/gc.csv rename to ACL2020/data/CAT_LARGE/gc.csv diff --git a/data/CAT_LARGE/gn.csv b/ACL2020/data/CAT_LARGE/gn.csv similarity index 100% rename from 
data/CAT_LARGE/gn.csv rename to ACL2020/data/CAT_LARGE/gn.csv diff --git a/data/CAT_SMALL/__init__.py b/ACL2020/data/CAT_SMALL/__init__.py similarity index 100% rename from data/CAT_SMALL/__init__.py rename to ACL2020/data/CAT_SMALL/__init__.py diff --git a/data/CC_supplementary.pdf b/ACL2020/data/CC_supplementary.pdf similarity index 100% rename from data/CC_supplementary.pdf rename to ACL2020/data/CC_supplementary.pdf diff --git a/experiment.multi.sh b/ACL2020/experiment.multi.sh similarity index 100% rename from experiment.multi.sh rename to ACL2020/experiment.multi.sh diff --git a/experiment.single.balanced.sh b/ACL2020/experiment.single.balanced.sh similarity index 100% rename from experiment.single.balanced.sh rename to ACL2020/experiment.single.balanced.sh diff --git a/experiment.single.standard.sh b/ACL2020/experiment.single.standard.sh similarity index 100% rename from experiment.single.standard.sh rename to ACL2020/experiment.single.standard.sh diff --git a/experiments.py b/ACL2020/experiments.py similarity index 96% rename from experiments.py rename to ACL2020/experiments.py index ef6b164..9b90d0f 100644 --- a/experiments.py +++ b/ACL2020/experiments.py @@ -1,13 +1,12 @@ from sklearn.model_selection import train_test_split import pandas as pd -from absl import flags, logging, app +from absl import flags, app import numpy as np -import classifiers +from ACL2020 import classifiers from sklearn.metrics import * from scipy.stats import sem import tensorflow as tf import os, sys -import json import datetime # Following is a dependency on the ssig package: #! 
git clone https://github.com/ipavlopoulos/ssig.git @@ -74,9 +73,9 @@ def train(with_context, verbose=1, splits_path="data/CAT_LARGE/MCCV", the_split_ model = classifiers.LSTM_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs) else: if FLAGS.model_name == "RNN:INC1": - model = classifiers.LSTM_IC1_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience) + model = classifiers.LSTM_IC1_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience) elif FLAGS.model_name == "RNN:INC2": - model = classifiers.LSTM_IC2_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience) + model = classifiers.LSTM_IC2_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience) elif "RNN" in FLAGS.model_name: print("Not implemented yet...") else: @@ -85,7 +84,7 @@ def train(with_context, verbose=1, splits_path="data/CAT_LARGE/MCCV", the_split_ lr = 2e-05 if FLAGS.model_name == "BERT:OOC": print("Training BERT with no context mechanism added.") - model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, epochs=FLAGS.epochs, session=sess) + model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, epochs=FLAGS.epochs, session=sess) elif FLAGS.model_name == "BERT:INC1": print("Training BERT with parent concatenated to text.") model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, DATA2_COLUMN="parent", epochs=FLAGS.epochs, session=sess) diff --git a/requirements.txt b/ACL2020/requirements.txt similarity index 100% rename from requirements.txt rename to ACL2020/requirements.txt diff --git a/utils.py b/ACL2020/utils.py similarity index 100% rename from utils.py rename to ACL2020/utils.py diff --git a/data/CCC.csv b/CCC.csv similarity index 100% rename from data/CCC.csv rename to CCC.csv diff --git a/README.md b/README.md index a31f30b..5d45e28 100644 --- a/README.md +++ b/README.md @@ 
-1,42 +1,48 @@ -# Toxicity detection w/ and w/o context -* Concerning comments existing in a thread. -* Context information: - * The parent comment. - * The discussion topic. -* The large dataset is included in the [data](https://github.com/ipavlopoulos/context_toxicity/tree/master/data) folder in the form of two CSV files. - * `gn.csv` comprises the out of context annotations. - * `gc.csv` comprises the in-context annotations. -* The small dataset will be included soon. - -### Word embeddings -* You will need to add a folder `embeddings` when using pre-trained embeddings. - * For example, GloVe embeddings. - -### Building the datasets -Create random splits: ->python experiments.py --create_random_splits 10 - -Downsample the two categories (one per dataset) to make the datasets equibalanced while equally sized: ->python experiments.py --create_balanced_datasets - -Then, create 10 random splits: ->python experiments.py --create_random_splits 10 --use_balanced_datasets True - -### Running a classifier - -Run a simple bi-LSTM by: -> nohup python experiments.py --with_context_data False --with_context_model "RNN:OOC" --repeat 10 > rnn.ooc.log & - -* You can train it also in IC data, by changing the related argument. - * If you call "RNN:INC1", the same LSTM will be trained, but another LSTM will encode the parent text (IC data required) and concatenate the two encoded texts before the dense layers on the top. - * If you call "BERT:OOC1" you have a simple BERT. - * If you call "BERT:OOC2" you concatenate the parent text (IC data required) with a SEPARATED token. - * If you call "BERT:CA" you extend BERT:OOC1 with the LSTM encoded parent text, similarly to the RNN:INC1. - -The names are messy, but these will hopefully change. 
- -### The article -* Presented at ACL'20 -* [Link to arXiv](https://arxiv.org/abs/2006.00998) -* Please cite: ->@misc{pavlopoulos2020toxicity, title={Toxicity Detection: Does Context Really Matter?}, author={John Pavlopoulos and Jeffrey Sorensen and Lucas Dixon and Nithum Thain and Ion Androutsopoulos}, year={2020}, eprint={2006.00998}, archivePrefix={arXiv}, primaryClass={cs.CL}} +# The CCC dataset + +The article presenting this dataset is [Context Sensitivity Estimation in Toxicity Detection](https://aclanthology.org/2021.woah-1.15/). + +To build the dataset of this work, we used the publicly available Civil Comments (CC) dataset (Borkan et al., 2019). +CC was originally annotated by ten annotators per post, but the parent post (the previous post in the thread) was not +shown to the annotators. + We call this new dataset Civil Comments in Context (CCC). Each CCC post was rated either as NON-TOXIC, UNSURE, TOXIC, or +VERY TOXIC, as in the original CC dataset. +We unified the latter two labels in both CC and CCC annotations to simplify the problem. In only 71 posts (0.07%) an annotator said UNSURE, meaning annotators were confident +in their decisions most of the time. We exclude these 71 posts from our study, as there are too few +to generalize about. 
+ +The dataset is stored as a CSV (CCC.csv), which contains 8 columns: + +* `id`: the id of the target post on the Civil Comments platform +* `tox_codes_oc`: the toxic codes given by the annotators who did not have access to the parent post +* `text`: the target post +* `toxicity_annotator_count`: the number of annotators who annotated this post +* `parent`: the parent post +* `tox_codes_ic`: the toxic codes given by the annotators who did have access to the parent post +* `tox_codes_parent`: the toxic codes (out of context) of the parent post +* `workers_ic`: the ids of the annotators on the Appen platform + +## Previous versions +* An older version of this dataset was presented at ACL 2020 and it is included in this repository. +* You can read the respective article [here](https://aclanthology.org/2020.acl-main.396/). + +## How to cite this dataset: +``` +@inproceedings{xenos-etal-2021-context, + title = "Context Sensitivity Estimation in Toxicity Detection", + author = "Xenos, Alexandros and + Pavlopoulos, John and + Androutsopoulos, Ion", + booktitle = "Proceedings of the 5th Workshop on Online Abuse and Harms (WOAH 2021)", + month = aug, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.woah-1.15", + doi = "10.18653/v1/2021.woah-1.15", + pages = "140--145", + abstract = "User posts whose perceived toxicity depends on the conversational context are rare in current toxicity detection datasets. Hence, toxicity detectors trained on current datasets will also disregard context, making the detection of context-sensitive toxicity a lot harder when it occurs. We constructed and publicly release a dataset of 10k posts with two kinds of toxicity labels per post, obtained from annotators who considered (i) both the current post and the previous one as context, or (ii) only the current post. 
We introduce a new task, context-sensitivity estimation, which aims to identify posts whose perceived toxicity changes if the context (previous post) is also considered. Using the new dataset, we show that systems can be developed for this task. Such systems could be used to enhance toxicity detection datasets with more context-dependent posts or to suggest when moderators should consider the parent posts, which may not always be necessary and may introduce additional costs.", +} +``` + + diff --git a/__init__.py b/__init__.py deleted file mode 100644 index 70df1ad..0000000 --- a/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . import utils, classifiers \ No newline at end of file diff --git a/data/README.MD b/data/README.MD deleted file mode 100644 index 9a8d1b6..0000000 --- a/data/README.MD +++ /dev/null @@ -1,45 +0,0 @@ -# The Dataset (CCC) - -To build the dataset of this work, we used the publicly available Civil Comments (CC) dataset (Borkan et al., 2019). - -CC was originally annotated by ten annotators per post, but the parent post (the previous post in the thread) was not -shown to the annotators. - We call this new dataset Civil Comments in Context (CCC). Each CCC post was rated either as NON-TOXIC, UNSURE, TOXIC, or -VERY TOXIC, as in the original CC dataset. -We unified the latter two labels in both CC and CCC annotations to simplify the problem. -In only 71 posts (0.07%) an annotator said UNSURE, meaning annotators were confident -in their decisions most of the time. We exclude these 71 posts from our study, as there are too few -to generalize about. - -The dataset is stored as a CSV. 
The data file contains 8 columns: - -* id = the id of the target post on the civil comments platform -* tox_codes_oc = the toxic codes given by the annotators whao did not have access to the parent post -* text = the target posts -* toxicity_annotator_count = the number of the annotators who annotated this post -* parent = the parent post -* tox_codes_ic = the toxic codes given by the annotators who did have access to the parent post -* tox_codes_parent = the toxic codes (out of context) of the parent post -* workers_ic = the ids of the annotators on the appen platform - - -## How to cite this dataset: -``` -@inproceedings{xenos-etal-2021-context, - title = "Context Sensitivity Estimation in Toxicity Detection", - author = "Xenos, Alexandros and - Pavlopoulos, John and - Androutsopoulos, Ion", - booktitle = "Proceedings of the 5th Workshop on Online Abuse and Harms (WOAH 2021)", - month = aug, - year = "2021", - address = "Online", - publisher = "Association for Computational Linguistics", - url = "https://aclanthology.org/2021.woah-1.15", - doi = "10.18653/v1/2021.woah-1.15", - pages = "140--145", - abstract = "User posts whose perceived toxicity depends on the conversational context are rare in current toxicity detection datasets. Hence, toxicity detectors trained on current datasets will also disregard context, making the detection of context-sensitive toxicity a lot harder when it occurs. We constructed and publicly release a dataset of 10k posts with two kinds of toxicity labels per post, obtained from annotators who considered (i) both the current post and the previous one as context, or (ii) only the current post. We introduce a new task, context-sensitivity estimation, which aims to identify posts whose perceived toxicity changes if the context (previous post) is also considered. Using the new dataset, we show that systems can be developed for this task. 
Such systems could be used to enhance toxicity detection datasets with more context-dependent posts or to suggest when moderators should consider the parent posts, which may not always be necessary and may introduce additional costs.", -} -``` - -