Skip to content

Commit e78e897

Browse files
committed
Refactor type handling in invoices
Previously, there was no consistent method to create new invoice columns with an explicit type. This has been the source of some confusion when debugging and writing test cases. As such, several changes have been added to reduce confusion over the typing of invoice columns: - A function (`_create_column`) has been added to `invoice.py` to provide a consistent way of creating new columns with an explicit type. - Many fields will now use Pandas' built-in extension types [1], such as `BooleanDtype`, which allows them to be nullable. This also enforces typing, as assigning incompatible values to these columns will raise a type error - Decimal precision has been increased to 21. This is to allow `int64` values to be converted to `pyarrow.decimal128(21, 2)`. This is useful for writing test cases (no longer have to write `[1.0, 2.0]`), and in case our invoices have balance fields that resemble integers - A new test case base class is made with a function to create a dataframe with standardized column types. This allows creating invoice dataframes that more closely resemble what our pipeline will see during execution - Removed redundant code in `_get_test_invoice()` of many test classes - `PIInvoice` will cast the dataframe to `StringDtype` to prevent TypeError with the new Pandas extension types [1] https://pandas.pydata.org/docs/user_guide/basics.html#dtypes
1 parent 54019b2 commit e78e897

19 files changed

+201
-132
lines changed

process_report/invoices/invoice.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from dataclasses import dataclass
22
import pandas
3+
import pyarrow
34

45
import process_report.util as util
56

@@ -58,6 +59,12 @@
5859
CLUSTER_NAME_FIELD = "Cluster Name"
5960
###
6061

62+
### Invoice column types
63+
BALANCE_FIELD_TYPE = pandas.ArrowDtype(pyarrow.decimal128(21, 2))
64+
STRING_FIELD_TYPE = pandas.StringDtype()
65+
BOOL_FIELD_TYPE = pandas.BooleanDtype()
66+
###
67+
6168

6269
@dataclass
6370
class Invoice:
@@ -110,6 +117,11 @@ def _prepare_export(self):
110117
that should or should not be exported after processing."""
111118
pass
112119

120+
def _create_column(self, column_name, column_type, value):
121+
"""Creates a new column in `self.data` with type and value."""
122+
self.data[column_name] = value
123+
self.data[column_name] = self.data[column_name].astype(column_type)
124+
113125
def _filter_columns(self):
114126
"""Filters and renames columns before exporting"""
115127
self.export_data = self.export_data[self.export_columns_list].rename(

process_report/invoices/pi_specific_invoice.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def _get_pi_dataframe(self, data, pi):
105105
lambda data: data if pandas.isna(data) else f"${data}"
106106
)
107107

108+
pi_projects = pi_projects.astype(invoice.STRING_FIELD_TYPE)
108109
pi_projects.fillna("", inplace=True)
109110

110111
return pi_projects

process_report/process_report.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@
66
import os
77

88
import pandas
9-
import pyarrow
109
from nerc_rates import load_from_url
1110

1211
from process_report import util
1312
from process_report.invoices import (
13+
invoice,
1414
lenovo_invoice,
1515
nonbillable_invoice,
1616
billable_invoice,
@@ -420,8 +420,8 @@ def merge_csv(files):
420420
dataframe = pandas.read_csv(
421421
file,
422422
dtype={
423-
COST_FIELD: pandas.ArrowDtype(pyarrow.decimal128(12, 2)),
424-
RATE_FIELD: str,
423+
COST_FIELD: invoice.BALANCE_FIELD_TYPE,
424+
RATE_FIELD: invoice.STRING_FIELD_TYPE,
425425
},
426426
)
427427
dataframes.append(dataframe)

process_report/processors/bu_subsidy_processor.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,16 @@ def get_project(row):
1919
else:
2020
return project_alloc[: project_alloc.rfind("-")]
2121

22-
self.data[invoice.PROJECT_NAME_FIELD] = self.data.apply(get_project, axis=1)
22+
self._create_column(
23+
invoice.PROJECT_NAME_FIELD,
24+
invoice.STRING_FIELD_TYPE,
25+
self.data.apply(get_project, axis=1),
26+
)
27+
self._create_column(
28+
invoice.SUBSIDY_FIELD,
29+
invoice.BALANCE_FIELD_TYPE,
30+
0,
31+
)
2332
self.data[invoice.SUBSIDY_FIELD] = Decimal(0)
2433

2534
def _process(self):

process_report/processors/lenovo_processor.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,13 @@ class LenovoProcessor(processor.Processor):
1010
SU_CHARGE_MULTIPLIER = 1
1111

1212
def _process(self):
13-
self.data[invoice.SU_CHARGE_FIELD] = self.SU_CHARGE_MULTIPLIER
14-
self.data[invoice.LENOVO_CHARGE_FIELD] = (
15-
self.data[invoice.SU_HOURS_FIELD] * self.data[invoice.SU_CHARGE_FIELD]
13+
self._create_column(
14+
invoice.SU_CHARGE_FIELD,
15+
int,
16+
self.SU_CHARGE_MULTIPLIER,
17+
)
18+
self._create_column(
19+
invoice.LENOVO_CHARGE_FIELD,
20+
invoice.BALANCE_FIELD_TYPE,
21+
self.data[invoice.SU_HOURS_FIELD] * self.data[invoice.SU_CHARGE_FIELD],
1622
)

process_report/processors/new_pi_credit_processor.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,26 @@ def get_initial_credit_amount(
183183
return (data, old_pi_df)
184184

185185
def _prepare(self):
186-
self.data[invoice.CREDIT_FIELD] = None
187-
self.data[invoice.CREDIT_CODE_FIELD] = None
188-
self.data[invoice.PI_BALANCE_FIELD] = self.data[invoice.COST_FIELD]
189-
self.data[invoice.BALANCE_FIELD] = self.data[invoice.COST_FIELD]
186+
self._create_column(
187+
invoice.CREDIT_FIELD,
188+
invoice.BALANCE_FIELD_TYPE,
189+
None,
190+
)
191+
self._create_column(
192+
invoice.CREDIT_CODE_FIELD,
193+
invoice.STRING_FIELD_TYPE,
194+
None,
195+
)
196+
self._create_column(
197+
invoice.PI_BALANCE_FIELD,
198+
invoice.BALANCE_FIELD_TYPE,
199+
self.data[invoice.COST_FIELD],
200+
)
201+
self._create_column(
202+
invoice.BALANCE_FIELD,
203+
invoice.BALANCE_FIELD_TYPE,
204+
self.data[invoice.COST_FIELD],
205+
)
190206
self.old_pi_df = self._load_old_pis(self.old_pi_filepath)
191207

192208
def _process(self):

process_report/processors/prepayment_processor.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,31 @@ def _load_prepay_debits(prepay_debits_filepath):
3838
return prepay_debits
3939

4040
def _prepare(self):
41-
self.data[invoice.GROUP_NAME_FIELD] = None
42-
self.data[invoice.GROUP_INSTITUTION_FIELD] = None
43-
self.data[invoice.GROUP_MANAGED_FIELD] = None
44-
self.data[invoice.GROUP_BALANCE_FIELD] = None
45-
self.data[invoice.GROUP_BALANCE_USED_FIELD] = None
41+
self._create_column(
42+
invoice.GROUP_NAME_FIELD,
43+
invoice.STRING_FIELD_TYPE,
44+
None,
45+
)
46+
self._create_column(
47+
invoice.GROUP_INSTITUTION_FIELD,
48+
invoice.STRING_FIELD_TYPE,
49+
None,
50+
)
51+
self._create_column(
52+
invoice.GROUP_MANAGED_FIELD,
53+
invoice.BOOL_FIELD_TYPE,
54+
None,
55+
)
56+
self._create_column(
57+
invoice.GROUP_BALANCE_FIELD,
58+
invoice.BALANCE_FIELD_TYPE,
59+
None,
60+
)
61+
self._create_column(
62+
invoice.GROUP_BALANCE_USED_FIELD,
63+
invoice.BALANCE_FIELD_TYPE,
64+
None,
65+
)
4666

4767
self.prepay_debits = self._load_prepay_debits(self.prepay_debits_filepath)
4868
self.group_info_dict = self._get_prepay_group_dict()

process_report/processors/validate_billable_pi_processor.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,15 @@ def _str_to_lowercase(data):
5656
)
5757

5858
def _process(self):
59-
self.data[invoice.IS_BILLABLE_FIELD] = self._get_billables(
60-
self.data, self.nonbillable_pis, self.nonbillable_projects
59+
self._create_column(
60+
invoice.IS_BILLABLE_FIELD,
61+
invoice.BOOL_FIELD_TYPE,
62+
self._get_billables(
63+
self.data, self.nonbillable_pis, self.nonbillable_projects
64+
),
65+
)
66+
self._create_column(
67+
invoice.MISSING_PI_FIELD,
68+
invoice.BOOL_FIELD_TYPE,
69+
self._validate_pi_names(self.data),
6170
)
62-
self.data[invoice.MISSING_PI_FIELD] = self._validate_pi_names(self.data)

process_report/tests/base.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,48 @@
33
from pathlib import Path
44
from unittest import TestCase
55

6+
import pandas
7+
import pyarrow
68

7-
class BaseTestCaseWithTempDir(TestCase):
9+
10+
### Invoice column types
11+
BOOL_FIELD_TYPE = pandas.BooleanDtype()
12+
STRING_FIELD_TYPE = pandas.StringDtype()
13+
BALANCE_FIELD_TYPE = pandas.ArrowDtype(pyarrow.decimal128(21, 2))
14+
###
15+
16+
17+
class BaseTestCase(TestCase):
18+
field_type_mapping = {
19+
"Is Billable": BOOL_FIELD_TYPE,
20+
"Missing PI": BOOL_FIELD_TYPE,
21+
"Rate": STRING_FIELD_TYPE,
22+
"Cost": BALANCE_FIELD_TYPE,
23+
"Credit": BALANCE_FIELD_TYPE,
24+
"Subsidy": BALANCE_FIELD_TYPE,
25+
"Balance": BALANCE_FIELD_TYPE,
26+
"Prepaid Group Managed": BOOL_FIELD_TYPE,
27+
"Prepaid Group Balance": BALANCE_FIELD_TYPE,
28+
"Prepaid Group Used": BALANCE_FIELD_TYPE,
29+
}
30+
31+
def _create_test_invoice(self, data: dict) -> pandas.DataFrame:
32+
"""
33+
Given a dictionary of data, create a DataFrame that represents an invoice.
34+
Also standardizes the type for some columns mostly those that expect monetary
35+
values, like BALANCE_FIELD, COST_FIELD, etc. These columns are usually
36+
converted to pyarrow.decimal128(21, 2)
37+
"""
38+
standard_dtypes = {}
39+
for column_name in data.keys():
40+
if column_name in self.field_type_mapping:
41+
standard_dtypes[column_name] = self.field_type_mapping[column_name]
42+
43+
test_invoice = pandas.DataFrame(data)
44+
return test_invoice.astype(standard_dtypes)
45+
46+
47+
class BaseTestCaseWithTempDir(BaseTestCase):
848
def setUp(self):
949
self.tempdir = Path(tempfile.TemporaryDirectory(delete=False).name)
1050

process_report/tests/unit/invoices/test_base_invoice.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from unittest import TestCase, mock
22
import pandas
33

4-
from process_report.tests import util as test_utils
4+
from process_report.tests import base, util as test_utils
55

66

7-
class TestBaseInvoice(TestCase):
7+
class TestBaseInvoice(base.BaseTestCase):
88
def test_filter_exported_columns(self):
99
test_invoice = pandas.DataFrame(columns=["C1", "C2", "C3", "C4", "C5"])
1010
answer_invoice = pandas.DataFrame(columns=["C1", "C3R", "C5R"])
@@ -17,6 +17,20 @@ def test_filter_exported_columns(self):
1717

1818
self.assertTrue(result_invoice.equals(answer_invoice))
1919

20+
def test_create_column(self):
21+
test_invoice = self._create_test_invoice({"Col1": [1, 2], "Col2": [3, 4]})
22+
answer_invoice = test_invoice.copy()
23+
answer_invoice["Col3"] = "default"
24+
answer_invoice["Col4"] = 10.0
25+
answer_invoice["Col3"] = answer_invoice["Col3"].astype(base.STRING_FIELD_TYPE)
26+
answer_invoice["Col4"] = answer_invoice["Col4"].astype(base.BALANCE_FIELD_TYPE)
27+
28+
inv = test_utils.new_base_invoice(data=test_invoice)
29+
inv._create_column("Col3", base.STRING_FIELD_TYPE, "default")
30+
inv._create_column("Col4", base.BALANCE_FIELD_TYPE, 10.0)
31+
32+
self.assertTrue(inv.data.equals(answer_invoice))
33+
2034

2135
class TestUploadToS3(TestCase):
2236
@mock.patch("process_report.util.get_invoice_bucket")

0 commit comments

Comments
 (0)