File renamed without changes.
File renamed without changes.
57 changes: 57 additions & 0 deletions ACL2020/README.md
@@ -0,0 +1,57 @@
# Toxicity detection w/ and w/o context
* The task concerns comments posted in a discussion thread.
* Context information:
    * The parent comment.
    * The discussion topic.
* The large dataset (CAT_LARGE) can be found in the `data` folder.
    * `gn.csv` contains the out-of-context annotations.
    * `gc.csv` contains the in-context annotations.
* The small dataset (CAT_SMALL) is also included.

### Word embeddings
* When using pre-trained embeddings, add an `embeddings` folder containing them (for example, GloVe embeddings; a download sketch follows).
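
For instance, GloVe vectors could be fetched roughly as follows (the standard Stanford download URL is assumed; which GloVe file and dimensionality the code expects is not specified here, so check `classifiers.py` before relying on a particular file):
> mkdir -p embeddings && cd embeddings
> wget http://nlp.stanford.edu/data/glove.6B.zip && unzip glove.6B.zip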

### Building the datasets
Create random splits:
>python experiments.py --create_random_splits 10

Downsample the majority category of each dataset so that the two datasets are class-balanced and equal in size (a rough sketch of this step follows the command):
>python experiments.py --create_balanced_datasets
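
For intuition only, a minimal pandas sketch of what class-balanced downsampling looks like; the actual logic lives in `experiments.py`, and the column name `label` is an assumption:

```python
import pandas as pd

def downsample_to_balance(df: pd.DataFrame, label_col: str = "label", seed: int = 42) -> pd.DataFrame:
    """Randomly drop rows of the majority class until both classes are equally frequent."""
    counts = df[label_col].value_counts()
    minority_size = counts.min()
    parts = [
        group.sample(n=minority_size, random_state=seed)
        for _, group in df.groupby(label_col)
    ]
    return pd.concat(parts).sample(frac=1, random_state=seed)  # shuffle after concatenation

# Hypothetical usage on the out-of-context annotations:
# gn = pd.read_csv("data/CAT_LARGE/gn.csv")
# gn_balanced = downsample_to_balance(gn)
```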

Then, create 10 random splits:
>python experiments.py --create_random_splits 10 --use_balanced_datasets True

### Running a classifier

Run a simple bi-LSTM with:
> nohup python experiments.py --with_context_data False --with_context_model "RNN:OOC" --repeat 10 > rnn.ooc.log &

* You can also train it on IC (in-context) data by changing the `--with_context_data` argument (see the example command after this list).
* With "RNN:INC1", the same LSTM is trained, but a second LSTM encodes the parent text (IC data required) and the two encodings are concatenated before the dense layers on top.
* With "BERT:OOC1" you get a plain BERT classifier.
* With "BERT:OOC2" the parent text (IC data required) is concatenated to the target text, separated by BERT's [SEP] token.
* With "BERT:CA" BERT:OOC1 is extended with an LSTM encoding of the parent text, similarly to RNN:INC1.

### The article
* Presented at the 58th Annual Meeting of the Association for Computational Linguistics ([link](https://arxiv.org/abs/2006.00998)).

### How to cite this work:
```
@inproceedings{pavlopoulos-etal-2020-toxicity,
    title = "Toxicity Detection: Does Context Really Matter?",
    author = "Pavlopoulos, John and
      Sorensen, Jeffrey and
      Dixon, Lucas and
      Thain, Nithum and
      Androutsopoulos, Ion",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.acl-main.396",
    doi = "10.18653/v1/2020.acl-main.396",
    pages = "4296--4305",
}
```
4 changes: 2 additions & 2 deletions classifiers.py → ACL2020/classifiers.py
@@ -2,7 +2,7 @@
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GRU, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
@@ -15,7 +15,7 @@
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from bert import tokenization
from utils import InputExample, convert_examples_to_features
from ACL2020.utils import InputExample, convert_examples_to_features
from sklearn.metrics import *
import pickle

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
11 changes: 5 additions & 6 deletions experiments.py → ACL2020/experiments.py
@@ -1,13 +1,12 @@
from sklearn.model_selection import train_test_split
import pandas as pd
from absl import flags, logging, app
from absl import flags, app
import numpy as np
import classifiers
from ACL2020 import classifiers
from sklearn.metrics import *
from scipy.stats import sem
import tensorflow as tf
import os, sys
import json
import datetime
# Following is a dependency on the ssig package:
#! git clone https://github.com/ipavlopoulos/ssig.git
@@ -74,9 +73,9 @@ def train(with_context, verbose=1, splits_path="data/CAT_LARGE/MCCV", the_split_
model = classifiers.LSTM_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs)
else:
if FLAGS.model_name == "RNN:INC1":
model = classifiers.LSTM_IC1_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience)
model = classifiers.LSTM_IC1_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience)
elif FLAGS.model_name == "RNN:INC2":
model = classifiers.LSTM_IC2_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience)
model = classifiers.LSTM_IC2_CLF(prefix=FLAGS.model_name.lower(), verbose=verbose, n_epochs=FLAGS.epochs, patience=FLAGS.patience)
elif "RNN" in FLAGS.model_name:
print("Not implemented yet...")
else:
@@ -85,7 +84,7 @@ def train(with_context, verbose=1, splits_path="data/CAT_LARGE/MCCV", the_split_
lr = 2e-05
if FLAGS.model_name == "BERT:OOC":
print("Training BERT with no context mechanism added.")
model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, epochs=FLAGS.epochs, session=sess)
model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, epochs=FLAGS.epochs, session=sess)
elif FLAGS.model_name == "BERT:INC1":
print("Training BERT with parent concatenated to text.")
model = classifiers.BERT_MLP(patience=FLAGS.patience, lr=lr, DATA2_COLUMN="parent", epochs=FLAGS.epochs, session=sess)
File renamed without changes.
File renamed without changes.
File renamed without changes.
90 changes: 48 additions & 42 deletions README.md
@@ -1,42 +1,48 @@
# Toxicity detection w/ and w/o context
* The task concerns comments posted in a discussion thread.
* Context information:
    * The parent comment.
    * The discussion topic.
* The large dataset is included in the [data](https://github.com/ipavlopoulos/context_toxicity/tree/master/data) folder as two CSV files.
    * `gn.csv` contains the out-of-context annotations.
    * `gc.csv` contains the in-context annotations.
* The small dataset will be included soon.

### Word embeddings
* When using pre-trained embeddings, add an `embeddings` folder containing them (for example, GloVe embeddings).

### Building the datasets
Create random splits:
>python experiments.py --create_random_splits 10

Downsample the majority category of each dataset so that the two datasets are class-balanced and equal in size:
>python experiments.py --create_balanced_datasets

Then, create 10 random splits:
>python experiments.py --create_random_splits 10 --use_balanced_datasets True

### Running a classifier

Run a simple bi-LSTM with:
> nohup python experiments.py --with_context_data False --with_context_model "RNN:OOC" --repeat 10 > rnn.ooc.log &

* You can also train it on IC (in-context) data by changing the `--with_context_data` argument.
* With "RNN:INC1", the same LSTM is trained, but a second LSTM encodes the parent text (IC data required) and the two encodings are concatenated before the dense layers on top.
* With "BERT:OOC1" you get a plain BERT classifier.
* With "BERT:OOC2" the parent text (IC data required) is concatenated to the target text, separated by BERT's [SEP] token.
* With "BERT:CA" BERT:OOC1 is extended with an LSTM encoding of the parent text, similarly to RNN:INC1.

The names are messy, but they will hopefully change.

### The article
* Presented at ACL'20
* [Link to arXiv](https://arxiv.org/abs/2006.00998)
* Please cite:
>@misc{pavlopoulos2020toxicity, title={Toxicity Detection: Does Context Really Matter?}, author={John Pavlopoulos and Jeffrey Sorensen and Lucas Dixon and Nithum Thain and Ion Androutsopoulos}, year={2020}, eprint={2006.00998}, archivePrefix={arXiv}, primaryClass={cs.CL}}
# The CCC dataset

The article presenting this dataset is [Context Sensitivity Estimation in Toxicity Detection](https://aclanthology.org/2021.woah-1.15/).

To build the dataset of this work, we used the publicly available Civil Comments (CC) dataset (Borkan et al., 2019).
CC was originally annotated by ten annotators per post, but the parent post (the previous post in the thread) was not
shown to the annotators.
We re-annotated a sample of CC posts, this time showing annotators the parent post as context, and we call the resulting
dataset Civil Comments in Context (CCC). Each CCC post was rated as NON-TOXIC, UNSURE, TOXIC, or VERY TOXIC, as in the
original CC dataset.
To simplify the problem, we unified the TOXIC and VERY TOXIC labels in both the CC and CCC annotations. An annotator chose
UNSURE for only 71 posts (0.07%), meaning that annotators were confident in their decisions most of the time; we excluded
these 71 posts from our study, as they are too few to generalize about. A sketch of these two preprocessing steps follows.
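
For concreteness, a minimal sketch of those two steps, assuming a hypothetical per-annotation table with one string label per (post, annotator) pair; the real annotation format may differ:

```python
import pandas as pd

# Hypothetical per-annotation table: one row per (post, annotator) label.
annotations = pd.DataFrame({
    "post_id": [1, 1, 2, 2, 3, 3],
    "label": ["TOXIC", "VERY TOXIC", "NON-TOXIC", "UNSURE", "NON-TOXIC", "NON-TOXIC"],
})

# Step 1: unify TOXIC and VERY TOXIC into a single TOXIC label.
annotations["label"] = annotations["label"].replace({"VERY TOXIC": "TOXIC"})

# Step 2: drop every post that received at least one UNSURE annotation (71 posts in CCC).
unsure_posts = annotations.loc[annotations["label"] == "UNSURE", "post_id"].unique()
annotations = annotations[~annotations["post_id"].isin(unsure_posts)]
```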

The dataset is stored as a CSV file (`CCC.csv`) with 8 columns (a loading sketch follows the list):

* `id`: the id of the target post on the Civil Comments platform
* `tox_codes_oc`: the toxicity codes given by the annotators who did not have access to the parent post
* `text`: the text of the target post
* `toxicity_annotator_count`: the number of annotators who annotated this post
* `parent`: the parent post
* `tox_codes_ic`: the toxicity codes given by the annotators who did have access to the parent post
* `tox_codes_parent`: the toxicity codes (out of context) of the parent post
* `workers_ic`: the ids of the annotators on the Appen platform
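
A minimal loading sketch, assuming the `tox_codes_*` columns hold stringified Python lists of numeric codes where 0 means non-toxic; both assumptions should be checked against the actual file:

```python
import ast
import pandas as pd

ccc = pd.read_csv("CCC.csv")

# Parse the code lists (assumed to be stored as stringified Python lists).
for col in ["tox_codes_oc", "tox_codes_ic", "tox_codes_parent"]:
    ccc[col] = ccc[col].apply(ast.literal_eval)

def majority_toxic(codes):
    """Majority vote: toxic if more than half of the codes are non-zero (an assumption)."""
    return int(sum(1 for c in codes if c > 0) > len(codes) / 2)

ccc["toxic_oc"] = ccc["tox_codes_oc"].apply(majority_toxic)
ccc["toxic_ic"] = ccc["tox_codes_ic"].apply(majority_toxic)

# Posts whose majority label flips when the parent post is shown are context-sensitive.
context_sensitive = ccc[ccc["toxic_oc"] != ccc["toxic_ic"]]
print(f"{len(context_sensitive)} posts change label when the parent post is shown")
```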

## Previous versions
* An older version of this dataset was presented at ACL 2020 and is included in this repository (see the `ACL2020` folder).
* You can read the respective article [here](https://aclanthology.org/2020.acl-main.396/).

## How to cite this dataset:
```
@inproceedings{xenos-etal-2021-context,
    title = "Context Sensitivity Estimation in Toxicity Detection",
    author = "Xenos, Alexandros and
      Pavlopoulos, John and
      Androutsopoulos, Ion",
    booktitle = "Proceedings of the 5th Workshop on Online Abuse and Harms (WOAH 2021)",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.woah-1.15",
    doi = "10.18653/v1/2021.woah-1.15",
    pages = "140--145",
    abstract = "User posts whose perceived toxicity depends on the conversational context are rare in current toxicity detection datasets. Hence, toxicity detectors trained on current datasets will also disregard context, making the detection of context-sensitive toxicity a lot harder when it occurs. We constructed and publicly release a dataset of 10k posts with two kinds of toxicity labels per post, obtained from annotators who considered (i) both the current post and the previous one as context, or (ii) only the current post. We introduce a new task, context-sensitivity estimation, which aims to identify posts whose perceived toxicity changes if the context (previous post) is also considered. Using the new dataset, we show that systems can be developed for this task. Such systems could be used to enhance toxicity detection datasets with more context-dependent posts or to suggest when moderators should consider the parent posts, which may not always be necessary and may introduce additional costs.",
}
```


1 change: 0 additions & 1 deletion __init__.py

This file was deleted.

45 changes: 0 additions & 45 deletions data/README.MD

This file was deleted.