-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathselectData.py
More file actions
executable file
·63 lines (56 loc) · 1.89 KB
/
selectData.py
File metadata and controls
executable file
·63 lines (56 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/python -W all
"""
selectData.py: select data for active learning
usage: selectData.py size < file
note: apply to the classification of the development data (EXPERIMENTS/dev)
20170622 erikt(at)xs4all.nl
"""
import csv
import random
import sys
# The script name is taken off sys.argv so that only the real arguments
# remain; it is reused in the usage and error messages.
COMMAND = sys.argv.pop(0)
USAGE = "usage: "+COMMAND+" size < file"
# Added to numerator and denominator of the score quotient in
# computeFactor, so a score of 0 does not give a zero denominator.
SMALLNUMBER = 0.0001
# NOTE(review): not referenced in the visible code; presumably the
# expected number of fields per input row -- confirm before relying on it.
NBROFFIELDS = 5

# The size argument is mandatory and must be an integer.
if not sys.argv:
    sys.exit(USAGE)
try:
    size = int(sys.argv.pop(0))
except ValueError:
    # Only a malformed number is a usage error; a bare except would also
    # trap KeyboardInterrupt and SystemExit.
    sys.exit(USAGE)
def computeFactor(row):
    """
    Return the quotient of the scores stored in row[2] and row[4].

    SMALLNUMBER is added to both scores before dividing, so a score of 0
    in row[4] does not cause a division by zero.

    row: sequence whose elements 2 and 4 can be converted with float()
    returns: (SMALLNUMBER+row[2])/(SMALLNUMBER+row[4]) as a float
    exits: via sys.exit with an error message when a score is missing,
           is not a number, or makes the denominator zero
    """
    try:
        factor = (SMALLNUMBER+float(row[2]))/(SMALLNUMBER+float(row[4]))
    except (IndexError, TypeError, ValueError, ZeroDivisionError):
        # Only data problems are reported as score problems; interrupts and
        # programming errors are no longer disguised by a bare except.
        sys.exit(COMMAND+": problem with scores in "+str(row)+"\n")
    return factor
# Load the tweets with their confidence scores from standard input.
# Example row: 7,7,0.930,6,0.262,...
csvreader = csv.reader(sys.stdin, delimiter=',', quotechar='"')
nbrOfFields = -1
rows = []
factors = []  # one score quotient (see computeFactor) per stored row
for row in csvreader:
    if nbrOfFields < 0:
        # the first row determines the number of fields of all rows
        nbrOfFields = len(row)
    elif len(row) != nbrOfFields:
        sys.exit(COMMAND+": unexpected row length: "+str(row)+"\n")
    rows.append(row)
    factors.append(computeFactor(row))
# Sort the factors: the value at the 10% position becomes the upper bound
# for "difficult" tweets (the ones with the smallest score quotients).
factors.sort()
top10 = int(0.10*float(len(factors)))
# Without input there are no factors and therefore no threshold.
threshold = factors[top10] if factors else None
# Number of tweets in rows that are still at or below the threshold.
# Keeping this count lets the rejection sampling below stop instead of
# looping forever once every difficult tweet has been drawn.
nbrOfDifficult = 0
if threshold is not None:
    nbrOfDifficult = sum(1 for row in rows if computeFactor(row) <= threshold)

# Select tweets: write exactly size rows (fewer only if the input runs out),
# alternating between a difficult tweet and a random tweet.
csvwriter = csv.writer(sys.stdout, delimiter=',', quotechar='"')
for i in range(size):
    if not rows:
        # more tweets requested than available: stop instead of crashing
        sys.stderr.write(COMMAND+": warning: only "+str(i)+" of "+str(size)+" tweets available\n")
        break
    r = random.randrange(len(rows))
    isDifficult = nbrOfDifficult > 0 and computeFactor(rows[r]) <= threshold
    if i % 2 == 0 and nbrOfDifficult > 0:
        # even positions: select a difficult tweet by redrawing until one
        # is found; this ends because at least one difficult tweet is left
        while not isDifficult:
            r = random.randrange(len(rows))
            isDifficult = computeFactor(rows[r]) <= threshold
    # odd positions (or no difficult tweet left): keep the random tweet
    csvwriter.writerow(rows[r])
    if isDifficult:
        nbrOfDifficult -= 1
    # delete the tweet (draw without replacement): move the last row into
    # its slot and drop the last element
    rows[r] = rows[-1]
    rows.pop()