-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathschema.sql
More file actions
224 lines (199 loc) · 7.6 KB
/
Copy pathschema.sql
File metadata and controls
224 lines (199 loc) · 7.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
-- Spatial extensions
CREATE EXTENSION postgis;
CREATE EXTENSION postgis_topology;
-- Create Trigger Function to update all_reports table
CREATE OR REPLACE FUNCTION public.update_all_reports_from_tweets()
RETURNS trigger AS
$BODY$
BEGIN
IF (TG_OP = 'UPDATE') THEN
INSERT INTO all_reports (fkey, created_at, text, source, lang, url, the_geom) SELECT NEW.pkey, NEW.created_at, NEW.text, 'twitter', NEW.lang, NEW.url, NEW.the_geom;
RETURN NEW;
ELSIF (TG_OP = 'INSERT') THEN
INSERT INTO all_reports (fkey, created_at, text, source, lang, url, the_geom) SELECT NEW.pkey, NEW.created_at, NEW.text, 'twitter', NEW.lang, NEW.url, NEW.the_geom;
RETURN NEW;
END IF;
END;
$BODY$
LANGUAGE plpgsql VOLATILE
COST 100;
ALTER FUNCTION public.update_all_reports_from_tweets()
OWNER TO postgres;
-- Table: tweet_reports
-- DROP TABLE tweet_reports;
-- Create table for Twitter reports
CREATE TABLE tweet_reports
(
pkey bigserial NOT NULL,
database_time timestamp with time zone DEFAULT now(),
created_at timestamp with time zone,
text character varying,
hashtags json,
text_urls character varying,
user_mentions json,
lang character varying,
url character varying,
tweet_id bigint,
CONSTRAINT pkey_tweets PRIMARY KEY (pkey)
)
WITH (
OIDS=FALSE
);
ALTER TABLE tweet_reports
OWNER TO postgres;
-- Add Geometry column to tweet_reports
SELECT AddGeometryColumn ('public','tweet_reports','the_geom',4326,'POINT',2);
-- Add GIST spatial index
CREATE INDEX gix_tweet_reports ON tweet_reports USING gist (the_geom);
-- Update all_reports table
CREATE TRIGGER trigger_update_all_reports_from_tweets
BEFORE INSERT OR UPDATE
ON public.tweet_reports
FOR EACH ROW
EXECUTE PROCEDURE public.update_all_reports_from_tweets();
-- Create table for Twitter report users
CREATE TABLE tweet_users
(
pkey bigserial,
user_hash character varying UNIQUE,
reports_count integer ,
CONSTRAINT pkey_tweet_users PRIMARY KEY (pkey)
)
WITH (
OIDS=FALSE
);
ALTER TABLE tweet_users
OWNER TO postgres;
-- Create table for non spatial tweets
CREATE TABLE nonspatial_tweet_reports
(
pkey bigserial NOT NULL,
database_time timestamp with time zone DEFAULT now(),
created_at timestamp with time zone,
text character varying,
hashtags json,
urls character varying,
user_mentions json,
lang character varying,
CONSTRAINT pkey_nonspatial_tweets PRIMARY KEY (pkey)
)
WITH (
OIDS=FALSE
);
ALTER TABLE nonspatial_tweet_reports
OWNER TO postgres;
-- Create table for unconfirmed tweet reports
CREATE TABLE tweet_reports_unconfirmed
(
pkey bigserial,
database_time timestamp with time zone DEFAULT now(),
created_at timestamp with time zone,
CONSTRAINT pkey_tweet_reports_unconfirmed PRIMARY KEY (pkey)
);
-- Add Geometry column to tweet_reports
SELECT AddGeometryColumn ('public','tweet_reports_unconfirmed','the_geom',4326,'POINT',2);
-- Creat Gist spatial index on unconfirmed reports
CREATE INDEX gix_tweet_reports_unconfirmed ON tweet_reports_unconfirmed USING gist (the_geom);
-- Tweet invitees
CREATE TABLE tweet_invitees
(
pkey bigserial,
user_hash character varying UNIQUE,
CONSTRAINT pkey_tweet_invitees PRIMARY KEY (pkey)
);
-- Create table for Twitter users with reports without geospatial metadata
CREATE TABLE nonspatial_tweet_users
(
pkey bigserial,
user_hash character varying UNIQUE,
CONSTRAINT nonspatial_tweet_users_pkey PRIMARY KEY (pkey)
);
-- Create a consolidated table for all users, regardless of data source
CREATE TABLE tweet_all_users
(
pkey bigserial,
user_hash character varying UNIQUE,
CONSTRAINT tweet_all_users_pkey PRIMARY KEY (pkey)
);
CREATE unique INDEX tweet_all_users_index ON tweet_all_users(user_hash);
-- Create a function to update tweet_all_users. Should be called by each data source table (e.g. tweet_reports)
CREATE OR REPLACE FUNCTION update_tweet_all_users()
RETURNS trigger AS $update_tweet_all_users$
BEGIN
INSERT INTO tweet_all_users(user_hash) SELECT NEW.user_hash WHERE NOT EXISTS (SELECT user_hash FROM tweet_all_users WHERE user_hash = NEW.user_hash);
RETURN NEW;
END;
$update_tweet_all_users$ LANGUAGE plpgsql;
CREATE TRIGGER non_spatial_all_users BEFORE INSERT OR UPDATE ON nonspatial_tweet_users
FOR EACH ROW EXECUTE PROCEDURE update_tweet_all_users();
CREATE TRIGGER tweet_invitees_all_users BEFORE INSERT OR UPDATE ON tweet_invitees
FOR EACH ROW EXECUTE PROCEDURE update_tweet_all_users();
CREATE TRIGGER tweet_users_all_users BEFORE INSERT OR UPDATE ON tweet_users
FOR EACH ROW EXECUTE PROCEDURE update_tweet_all_users();
--Function to update or insert tweet users
CREATE FUNCTION upsert_tweet_users(hash varchar) RETURNS VOID AS
$$
BEGIN
LOOP
-- first try to update the key
UPDATE tweet_users SET reports_count = reports_count + 1 WHERE user_hash = hash;
IF found THEN
RETURN;
END IF;
-- not there, so try to insert the key
-- if someone else inserts the same key concurrently,
-- we could get a unique-key failure
BEGIN
INSERT INTO tweet_users(user_hash,reports_count) VALUES (hash, 1);
RETURN;
EXCEPTION WHEN unique_violation THEN
-- Do nothing, and loop to try the UPDATE again.
END;
END LOOP;
END;
$$
LANGUAGE plpgsql;
-- Create Table to store reports
CREATE TABLE all_reports
(
pkey bigserial NOT NULL,
fkey bigint NOT NULL,
database_time timestamp with time zone DEFAULT now(),
created_at timestamp with time zone,
text character varying NOT NULL,
source character varying NOT NULL,
status character varying DEFAULT 'confirmed',
lang character varying,
url character varying,
image_url character varying,
title character varying,
CONSTRAINT all_tweets PRIMARY KEY (pkey)
)
WITH (
OIDS=FALSE
);
ALTER TABLE all_reports
OWNER TO postgres;
-- Create table to store id of last seen tweet id as captured using GNIP
CREATE TABLE seen_tweet_id (onerow_id bool PRIMARY KEY DEFAULT TRUE, id bigint, CONSTRAINT onerow_uni CHECK (onerow_ID));
INSERT INTO seen_tweet_id VALUES (TRUE, 0);
-- Add Geometry column to tweet_reports
SELECT AddGeometryColumn ('public','all_reports','the_geom',4326,'POINT',2);
ALTER TABLE all_reports ALTER COLUMN the_geom SET NOT NULL;
-- Add GIST spatial index
CREATE INDEX gix_all_reports ON all_reports USING gist (the_geom);
-- Document the table
COMMENT ON TABLE all_reports IS 'Reports from all input data sources';
COMMENT ON COLUMN all_reports.pkey IS '{bigserial} [Primary Key] Unique key for each report';
COMMENT ON COLUMN all_reports.fkey IS '{bigint} [Foreign Key] Unique key from source data table';
COMMENT ON COLUMN all_reports.database_time IS '{timestamp with timezone} Time report written to table';
COMMENT ON COLUMN all_reports.created_at IS '{timestamp with timezone} Time of report as recorded at data source';
COMMENT ON COLUMN all_reports.text IS '{character varying} The text of the report';
COMMENT ON COLUMN all_reports.source IS '{character varying} Data source of the report';
COMMENT ON COLUMN all_reports.status IS '{character varying} Status of the report (defaults to confirmed)';
COMMENT ON COLUMN all_reports.lang IS '{character varying | NULL} Language of report text in all_reports.text';
COMMENT ON COLUMN all_reports.url IS '{character varying | NULL} URL link to report data source';
COMMENT ON COLUMN all_reports.image_url IS '{character varying | NULL} URL link to report image';
COMMENT ON COLUMN all_reports.title IS '{character varying | NULL} Short description of report';
COMMENT ON COLUMN all_reports.the_geom IS '{geometry object} Point location for report using the WGS 1984 coordinate reference system';
COMMENT ON INDEX gix_tweet_reports IS 'Generalized Search Tree Index on all_reports.the_geom';