Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions affildb/augmenter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import json

class AffilAugmenter(object):
    """Flatten per-author institution data into the augmented output fields.

    ``parse`` receives a record dict and ``author_data``: one entry per
    author, each entry being a list of institution dicts.  Each institution
    field is collapsed into one "; "-joined string per author, and the
    result is returned together with selected fields copied from the record.
    """

    # Placeholder emitted wherever an institution field is unavailable.
    _MISSING = "-"

    def __init__(self):
        pass

    def _join_field(self, key):
        """Return one "; "-joined string of ``key`` values per author.

        A key that is absent, or present with a ``None`` value, contributes
        the "-" placeholder so that ``str.join`` never receives ``None``.
        """
        joined = []
        for auth in self.author_data:
            values = []
            for inst in auth:
                value = inst.get(key)
                values.append(self._MISSING if value is None else value)
            joined.append("; ".join(values))
        return joined

    def _build_aff_id(self):
        """Populate ``self.aff_id`` from each institution's ``inst_id``."""
        self.aff_id = self._join_field("inst_id")

    def _build_aff_canonical(self):
        """Populate ``self.aff_canonical`` from ``inst_canonical``."""
        self.aff_canonical = self._join_field("inst_canonical")

    def _build_aff_country(self):
        """Populate ``self.aff_country`` from ``inst_country``."""
        self.aff_country = self._join_field("inst_country")

    def _build_aff_iso_country(self):
        """Populate ``self.aff_iso_country`` from ``inst_iso_country``."""
        self.aff_iso_country = self._join_field("inst_iso_country")

    def _build_parents(self):
        """Populate ``self.aff_abbrev`` and ``self.aff_facet_hier``.

        For every institution that has an abbreviation, a "parent/child"
        string is produced for each parent abbreviation; an institution with
        no parents is treated as its own parent.  Each pair also contributes
        the facets "0/parent" and "1/parent/child", which are collected once
        each, in first-seen order, across all authors.
        """
        facets = []
        seen = set()  # O(1) membership test; ``facets`` preserves the order

        def add_facet(facet):
            if facet not in seen:
                seen.add(facet)
                facets.append(facet)

        abbrevs = []
        for auth in self.author_data:
            auth_abbrevs = []
            for aff in auth:
                abbrev = aff.get("inst_abbreviation")
                if abbrev is None or abbrev == self._MISSING:
                    auth_abbrevs.append(self._MISSING)
                    continue
                parents = [
                    p.get("inst_abbreviation", "-")
                    for p in aff.get("parent_data") or []
                ]
                if not parents:
                    # Top-level institution: it acts as its own parent.
                    parents = [abbrev]
                auth_abbrevs.append(
                    "; ".join(["%s/%s" % (p, abbrev) for p in parents])
                )
                for p in parents:
                    add_facet("0/%s" % p)
                    add_facet("1/%s/%s" % (p, abbrev))
            abbrevs.append("; ".join(auth_abbrevs))

        self.aff_facet_hier = facets
        self.aff_abbrev = abbrevs

    def _build_output(self):
        """Run every builder and assemble ``self.output``."""
        self.aff = self.record.get("aff", [])
        self._build_aff_canonical()
        self._build_aff_country()
        self._build_aff_iso_country()
        self._build_aff_id()
        self._build_parents()
        self.author = self.record.get("author", [])
        self.bibcode = self.record.get("bibcode", "")
        self.scixID = self.record.get("scixID", "")
        self.output = {
            "aff": self.aff,
            "aff_abbrev": self.aff_abbrev,
            "aff_country": self.aff_country,
            "aff_canonical": self.aff_canonical,
            "aff_facet_hier": self.aff_facet_hier,
            "aff_id": self.aff_id,
            "aff_iso_country": self.aff_iso_country,
            "author": self.author,
            "bibcode": self.bibcode,
            "scix_id": self.scixID
        }

    def parse(self, record, author_data):
        """Return the augmented output dict for ``record``.

        Args:
            record: dict read for the "aff", "author", "bibcode" and
                "scixID" keys (each optional).
            author_data: list with one entry per author; each entry is a
                list of institution dicts (an empty dict for an
                affiliation that was not matched).

        Returns:
            dict with keys "aff", "aff_abbrev", "aff_country",
            "aff_canonical", "aff_facet_hier", "aff_id", "aff_iso_country",
            "author", "bibcode" and "scix_id".
        """
        self.record = record
        self.author_data = author_data
        self._build_output()
        return self.output
71 changes: 63 additions & 8 deletions affildb/database.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os

from adsputils import load_config, setup_logging
from sqlalchemy import func

from affildb.models import AffilInst as affil_inst
from affildb.models import AffilData as affil_data
from affildb.augmenter import AffilAugmenter as aa

proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
config = load_config(proj_home=proj_home)
Expand All @@ -25,7 +26,7 @@ class DBWriteException(Exception):
class DBQueryException(Exception):
    """Raised when a database query cannot be completed."""

# general use functions,
# db management functions,
def clear_table(app, table):
with app.session_scope() as session:
try:
Expand All @@ -46,19 +47,73 @@ def write_block_to_table(app, table, datablock):
session.flush()
raise DBWriteException("Failed to bulk write data block: %s" % err)

def fetch_data_table(app, table):
    """Return every row of ``table`` as ``[affil_id, affil_string]`` pairs.

    Args:
        app: object whose ``session_scope()`` yields a database session.
        table: mapped class to query; its rows must expose ``affil_id``
            and ``affil_string``.

    Returns:
        list of two-element lists, one per row, in query order.

    Raises:
        DBQueryException: if the query or row extraction fails.
    """
    with app.session_scope() as session:
        try:
            # Copy plain values out of the rows while the session is open.
            return [
                [row.affil_id, row.affil_string]
                for row in session.query(table).all()
            ]
        except Exception as err:
            # The previous message formatted ``query_string``, which this
            # function never defines, turning every failure into NameError.
            raise DBQueryException("Unable to query %s for full table: %s" % (str(table), err))


def query_distinct_norm(app, table):
    """Return the rows of ``table`` made distinct on ``table.norm_string``.

    Args:
        app: object whose ``session_scope()`` yields a database session.
        table: mapped class exposing a ``norm_string`` column.

    Returns:
        The list produced by ``Query.distinct(table.norm_string).all()``.

    Raises:
        DBQueryException: if the query fails.
    """
    # NOTE(review): distinct() with a column argument presumably relies on
    # PostgreSQL's DISTINCT ON -- confirm against the target backend.
    with app.session_scope() as session:
        try:
            return session.query(table).distinct(table.norm_string).all()
        except Exception as err:
            raise DBQueryException("Unable to query %s for distinct normalized strings: %s" % (str(table), err))


# augment pipeline functions
def query_one_string(app, query_string, norm):
    """Look up the institution matching one affiliation string.

    Args:
        app: object whose ``session_scope()`` yields a database session.
        query_string: affiliation string to match.
        norm: when truthy, match against ``affil_data.norm_string``;
            otherwise against ``affil_data.affil_string``.

    Returns:
        ``{}`` when nothing matches.  Otherwise the matched institution's
        ``toJSON()`` dict with an added "parent_data" key: a list holding
        the ``toJSON()`` dict of every parent that could be loaded.

    Raises:
        DBQueryException: if the lookup fails.
    """
    with app.session_scope() as session:
        try:
            # Both lookups are the same join; only the compared column differs.
            if norm:
                match_column = affil_data.norm_string
            else:
                match_column = affil_data.affil_string
            inst = (
                session.query(affil_inst)
                .join(affil_data, affil_data.affil_id == affil_inst.inst_id)
                .filter(match_column == query_string)
                .first()
            )
            if inst is None:
                return {}

            child_data = inst.toJSON()
            parent_str = child_data.get("inst_parents", "")
            parent_data = []
            if parent_str:
                for parent_id in [x.strip() for x in parent_str.split(";")]:
                    if not parent_id:
                        # Empty piece, e.g. from a trailing ";".
                        continue
                    # Parent lookups stay best-effort: one bad parent must
                    # not discard the child that was already found.
                    try:
                        parent = session.query(affil_inst).filter(affil_inst.inst_id == parent_id).first()
                        if parent is None:
                            print("Parent institution %s not found for %s" % (parent_id, query_string))
                            continue
                        parent_data.append(parent.toJSON())
                    except Exception as err:
                        print("This shouldn't happen: %s" % err)
            child_data["parent_data"] = parent_data
            return child_data

        except Exception as err:
            raise DBQueryException("Unable to query %s for %s: %s" % (str(affil_data), query_string, err))


def augment_record(app, record, norm):
    """Resolve a record's affiliation strings and build its augmented output.

    Each entry of ``record["aff"]`` is one author's affiliations separated
    by ";".  Every piece is stripped and looked up with
    ``query_one_string``; the per-author results are then passed, together
    with the record, to ``AffilAugmenter.parse``.

    Args:
        app: object passed through to ``query_one_string``.
        record: dict; its "aff" key (optional) lists one string per author.
        norm: passed through to ``query_one_string``.

    Returns:
        The dict returned by ``AffilAugmenter.parse``, or ``None`` when any
        step fails (the error is printed).
    """
    try:
        author_data = []
        for auth in record.get("aff", []):
            author_data.append(
                [query_one_string(app, a.strip(), norm) for a in auth.split(";")]
            )
        # parse() requires the record as well as the per-author data;
        # omitting it made every call fail with TypeError.
        return aa().parse(record, author_data)
    except Exception as err:
        print("Welp... %s" % err)
        return None


# Example of the structure returned by query_one_string for a matched affiliation:
#{'inst_country': 'USA', 'inst_parents': 'A00976', 'inst_id': 'A00977', 'inst_abbreviation': 'Bartol Res Inst', 'inst_canonical': 'University of Delaware, Bartol Research Institute', 'error': '', 'parent_data': [{'inst_country': 'USA', 'inst_parents': '', 'inst_id': 'A00976', 'inst_abbreviation': 'U Delaware', 'inst_canonical': 'University of Delaware', 'error': ''}]}
93 changes: 61 additions & 32 deletions affildb/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,56 +3,85 @@
except ImportError:
from adsmutils import get_date, UTCDateTime

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy import Column, Integer, String, Text, Boolean, Index
from sqlalchemy.dialects.postgresql import ARRAY
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class AffilData(Base):
    """
    affil_data holds the mapping of published string and affiliation ID
    """
    # NOTE(review): a second AffilData class mapped to "affil_data" appears
    # later in this text; this one looks like the pre-change version from
    # the diff -- confirm which definition is current before relying on it.

    __tablename__ = "affil_data"

    # Integer primary key.
    data_key = Column(Integer, primary_key=True, unique=True)
    # Affiliation ID, at most 6 characters; required.
    affil_id = Column(String(6), nullable=False)
    # Affiliation string; required and unique across the table.
    affil_string = Column(Text, unique=True, nullable=False)
    # Filled by get_date when no value is supplied on insert.
    created = Column(UTCDateTime, default=get_date)
    # Refreshed by get_date on update; no insert-time default is declared.
    updated = Column(UTCDateTime, onupdate=get_date)


class AffilNorm(Base):
    """Mapping of a unique affiliation string to an affiliation ID.

    NOTE(review): presumably holds normalized strings, going by the table
    name -- confirm.
    """
    __tablename__ = "affil_norm"

    # Integer primary key.
    norm_key = Column(Integer, primary_key=True, unique=True)
    # Affiliation ID, at most 6 characters; required, may repeat across rows.
    affil_id = Column(String(6), unique=False, nullable=False)
    # Affiliation string; required and unique across the table.
    affil_string = Column(Text, unique=True, nullable=False)


class AffilInst(Base):
    """One institution: identifier, names, country data and parent ids."""

    __tablename__ = "affil_inst"

    # inst_key and inst_id are both flagged primary_key, so together they
    # form the table's primary key.
    inst_key = Column(Integer, primary_key=True, autoincrement=True, unique=True)
    inst_id = Column(String(6), primary_key=True, unique=True, nullable=False)
    # Parent institution ids; consumers split this value on ";".
    inst_parents = Column(String, nullable=True)
    inst_canonical = Column(String, nullable=False)
    inst_abbreviation = Column(String, nullable=False)
    inst_country = Column(String, nullable=False)
    inst_iso_country = Column(String, nullable=False)
    # in place of location, we could consider using GeoAlchemy2 here
    # especially if we can get lat-lon from ROR
    inst_location = Column(String, nullable=True)
    inst_rorid = Column(String, nullable=True)
    inst_notes = Column(Text, nullable=True)
    created = Column(UTCDateTime, default=get_date)
    updated = Column(UTCDateTime, onupdate=get_date)

    def toJSON(self):
        """Return this institution's fields as a plain dict.

        Returns:
            dict with "inst_iso_country", "inst_country", "inst_parents",
            "inst_id", "inst_abbreviation", "inst_canonical" and an empty
            "error" string; or ``{"error": <message>}`` if reading an
            attribute fails.
        """
        try:
            outputJson = {"inst_iso_country": self.inst_iso_country,
                          # Previously filled from inst_iso_country by mistake.
                          "inst_country": self.inst_country,
                          "inst_parents": self.inst_parents,
                          "inst_id": self.inst_id,
                          "inst_abbreviation": self.inst_abbreviation,
                          "inst_canonical": self.inst_canonical,
                          "error": ""}
            return outputJson
        except Exception as err:
            # A string keeps "error" the same type as on the success path.
            return {"error": str(err)}

    @staticmethod
    def toRow(rowdat):
        """Map a 6-item sequence onto the institution field names.

        Declared static because it takes no instance; calling it through
        the class works as before, and calling it through an instance no
        longer passes the instance as ``rowdat``.

        Args:
            rowdat: sequence ordered as (inst_iso_country, inst_country,
                inst_parents, inst_id, inst_abbreviation, inst_canonical).

        Returns:
            dict keyed by those names, or ``{}`` when ``rowdat`` does not
            have exactly 6 items.
        """
        if len(rowdat) == 6:
            return {"inst_iso_country": rowdat[0],
                    "inst_country": rowdat[1],
                    "inst_parents": rowdat[2],
                    "inst_id": rowdat[3],
                    "inst_abbreviation": rowdat[4],
                    "inst_canonical": rowdat[5]}
        else:
            return {}

class AffilData(Base):
    """
    affil_data holds the mapping of published string and affiliation ID
    """

    __tablename__ = "affil_data"

    # data_key, affil_id and norm_string are all flagged primary_key, so
    # together they form the table's primary key.
    data_key = Column(Integer, primary_key=True, autoincrement=True, unique=True)
    affil_id = Column(String(6), primary_key=True, unique=False, nullable=False)
    affil_string = Column(Text, unique=True, nullable=False)
    norm_string = Column(Text, primary_key=True, unique=False, nullable=False)
    flagged = Column(Boolean, default=False, nullable=False)
    created = Column(UTCDateTime, default=get_date, nullable=False)
    # ``updated`` is NOT NULL, so it needs an insert-time default as well:
    # ``onupdate`` alone only supplies a value on UPDATE.
    updated = Column(UTCDateTime, default=get_date, onupdate=get_date, nullable=False)

    # Hash index on norm_string (PostgreSQL-specific index method).
    __table_args__ = (Index('norm_index', norm_string, postgresql_using="hash"),)

    def toJSON(self):
        """Return the id and both strings of this row as a plain dict.

        Returns:
            dict with "affil_id", "affil_string" and "norm_string"; or
            ``{"error": <message>}`` if reading an attribute fails.
        """
        try:
            outputJSON = {"affil_id": self.affil_id,
                          "affil_string": self.affil_string,
                          "norm_string": self.norm_string}
            return outputJSON
        except Exception as err:
            # Report the message as a string so the dict stays serializable.
            return {"error": str(err)}

    @staticmethod
    def toRow(rowdat):
        """Map a 3-item sequence onto the affil_data field names.

        Declared static because it takes no instance; calling it through
        the class works as before.

        Args:
            rowdat: sequence ordered as (affil_id, affil_string,
                norm_string).

        Returns:
            dict keyed by those names, or ``{}`` when ``rowdat`` does not
            have exactly 3 items.
        """
        if len(rowdat) == 3:
            return {"affil_id": rowdat[0],
                    "affil_string": rowdat[1],
                    "norm_string": rowdat[2]}
        else:
            return {}
Loading