-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze.py
More file actions
127 lines (111 loc) · 4.25 KB
/
Copy pathanalyze.py
File metadata and controls
127 lines (111 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
'''
Name: analyze.py
Author: Jennifer Hu
Purpose: Compute and minimize entropy for the elicited productions.
'''
import argparse
from utils import *
import spacy
import pandas as pd
from numpy import mean, std
##################################################################################
## HELPER FUNCTIONS
##################################################################################
def get_token_stats(df, stim, nlp):
'''
Returns mean and standard deviation of distribution of number of tokens
across responses for a particular stimulus.
'''
# get productions associated with particular stimulus
productions = df.loc[df.img == stim].production
# parse and clean
docs = nlp.pipe(productions, batch_size=50, n_threads=4)
texts = [[token.text for token in doc] for doc in docs]
texts = [clean(token_list) for token_list in texts]
# calculate number of tokens
num_tokens = [len(token_list) for token_list in texts]
return mean(num_tokens), std(num_tokens)
def mean_std_lengths(df, nlp):
'''
Returns mean and standard deviation of production lengths (in tokens)
for each image stimulus.
'''
# get list of stimuli
stims = df.img.unique()
# get means and stds
stats = [get_token_stats(df, stim, nlp) for stim in stims]
means, stds = unzip(stats)
return means, stds
def num_unique(df, stim, val, lemma=False, nlp=None):
'''
Returns number of unique responses or lemmas for a given stimulus and
column name (val).
'''
# get rows corresponding to stim and column corresponding to val
texts = df.loc[df.img == stim, val].values
# clean texts (remove punctuation, capitalization)
texts = clean(texts)
if lemma:
# count number of unique lemmas
docs = nlp.pipe(texts, batch_size=50, n_threads=4)
lemmas = [token.lemma_ for doc in docs for token in doc]
return num_unique_elts(lemmas)
else:
# count number of unique responses
return num_unique_elts(texts)
def count_unique(df, nlp):
'''
Returns dict with number of unique responses for the values in df,
as well as the number of unique lemmas for the single-word columns
(vp_head and subject_np_head).
'''
# get list of stimuli
stims = df.img.unique()
# list of column names to calculate over
vals = [c for c in list(df) if c not in ['img', 'production', 'hit']]
# get number of unique responses
data = { val : [num_unique(df, stim, val) for stim in stims]
for val in vals }
# get number of unique lemmas for vp and np heads
data['vp_head_lemmas'] = [
num_unique(df, stim, 'vp_head', lemma=True, nlp=nlp) for stim in stims
]
data['subject_np_head_lemmas'] = [
num_unique(df, stim, 'subject_np_head', lemma=True, nlp=nlp)
for stim in stims
]
data['img'] = stims
return data
def minimize(df, nlp):
unique = count_unique(df, nlp)
_, stds = mean_std_lengths(df, nlp)
unique['std_num_tokens'] = stds
# calculate and sort by sum of entire row
df = pd.DataFrame(unique)
df['sum'] = df.sum(numeric_only=True, axis=1)
sorted_df = df.sort_values(by=['sum'])
return sorted_df
##################################################################################
## MAIN
##################################################################################
def main(data, out, overwrite):
if data == out and not overwrite:
raise NameError('Pass the -O flag to allow overwriting.')
else:
nlp = spacy.load('en')
print('Reading data from {}...'.format(data))
data_df = pd.read_csv(data)
print('Minimizing entropy...')
sorted = minimize(data_df, nlp)
sorted.to_csv(out, index=False)
print('Saved to {}'.format(out))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str,
help='path to data file (expects .csv)')
parser.add_argument('--out', type=str,
help='path to write output file')
parser.add_argument('--overwrite', '-O', action='store_true', default=False,
help='flag to overwrite data file if data==out')
args = parser.parse_args()
main(**vars(args))