Skip to content
Open
299 changes: 129 additions & 170 deletions parser/extract/asset.go
Original file line number Diff line number Diff line change
@@ -1,197 +1,156 @@
package extract

import (
"bufio"
"github.com/pkg/errors"
"strconv"
"fmt"
"strings"

"github.com/InstIDEA/ddjj/parser/declaration"
)

// totalAssets holds the total read from the document by getAssetLine when it
// reaches the "TOTAL OTROS ACTIVOS" line. Assets compares it against the sum
// of the extracted amounts and resets it afterwards.
var totalAssets int64

// assetsItemNumber is the number of the item currently being read. Its string
// form is kept in skipAssets so the number itself is not read as a value.
var assetsItemNumber int

// skipAssets lists the lines that getAssetLine ignores: the table column
// titles, plus the item numbers appended while parsing.
var skipAssets = []string{
"#",
"DESCRIPCIÓN",
"EMPRESA",
"RUC",
"PAÍS",
"CANT.",
"PRECIO UNI.",
"IMPORTE",
}

// Assets returns other assets owned by the official.
func Assets(scanner *bufio.Scanner) ([]*declaration.OtherAsset, error) {
scanner = MoveUntil(scanner, "1.9 OTROS ACTIVOS", true)

// The item number must be skipped as well.
assetsItemNumber = 1
skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber))

var assets []*declaration.OtherAsset

values, nextPage := getAssetValues(scanner, 0, false)
for values[0] != "" {
asset := getAsset(scanner, values)
assets = append(assets, asset...)

if nextPage {
assetsItemNumber = 1
} else {
assetsItemNumber++
func Assets(e *Extractor, parser *ParserData) ([]*declaration.OtherAsset, error) {
var assets []*declaration.OtherAsset // list of extracted assets
asset := &declaration.OtherAsset{} // scratch value for the asset being extracted
e.BindFlag(EXTRACTOR_FLAG_1) // removes blank lines
e.BindFlag(EXTRACTOR_FLAG_2) // trims whitespace at both ends of each line
// EXTRACTOR_FLAG_3 (not bound here) creates new tokens whenever a line contains 3 or more spaces.
// bandera ("flag") is true while the scan is inside the table records.
var bandera bool
bandera = false
// counter counts the "OBS:" lines seen inside the table; successful is the
// number of assets actually extracted. They are compared after the scan.
counter := 0
successful := 0
if e.MoveUntilStartWith(CurrToken, "1.9 OTROS ACTIVOS") {
for e.Scan() {
// The table header and the OBS lines are not records; skip them.
if isAssetFormField(e.CurrToken) {
bandera = true // the header was found, so the records follow
continue
}
if strings.Contains(e.CurrToken, "OBS:") && bandera {
counter++
continue
}
// End of the other-assets table on the current page.
if strings.Contains(e.CurrToken, "TOTAL OTROS ACTIVOS") {
bandera = false
}
// Only extract while the flag is set, i.e. inside the table records.
if bandera {
values := tokenize(e.CurrToken, 3)
// Case 1: the description is split over two lines.
// The lines then look like:
//   descPart1
//   record number
//   descPart2
//   rest of the row
if len(values) == 1 && isNumber(e.CurrToken) {
description := e.PrevToken + " " + e.NextToken
// Move the current token past descPart2 to the rest of the row.
e.Scan()
e.Scan()

// Rebuild the row in the layout expected by getAsset.
fixed := []string{"#", description}
values = append(fixed, tokenize(e.CurrToken, 3)...)
} else
// Case 2: the enterprise name is split over two lines.
// The lines then look like:
//   enterprisePart1
//   record number + description
//   enterprisePart2
//   rest of the row
if len(values) == 2 {
enterpriseNamePart1 := e.PrevToken
// The description is the second value of the current token.
description := values[1]
e.Scan() // advance to enterprisePart2; the description was saved above
allName := enterpriseNamePart1 + " " + e.CurrToken
// Move to the rest of the row.
e.Scan()

// Rebuild the row in the layout expected by getAsset.
fixed := []string{"#", description, allName}
values = append(fixed, tokenize(e.CurrToken, 3)...)

} else
// Case 3: the country is split over two lines.
//   namePart1
//   num + description + enterprise + ruc
//   namePart2
//   cant + price + total
if len(values) == 4 {
country := e.PrevToken + " " + e.NextToken
description := values[1]
enterprise := values[2]
ruc := values[3]
// Move the current token past namePart2 to the rest of the row.
e.Scan()
e.Scan()

// Rebuild the row in the layout expected by getAsset.
// NOTE(review): this case tokenizes with 4 while cases 1 and 2 use 3 — confirm this is intended.
fixed := []string{"#", description, enterprise, ruc, country}
values = append(fixed, tokenize(e.CurrToken, 4)...)
}

// Only complete rows (8 values) are turned into an asset.
if len(values) == 8 {
asset = getAsset(values)
assets = append(assets, asset)
}
}
}
// The item number must be skipped as well.
skipAssets[len(skipAssets)-1] = strconv.Itoa(assetsItemNumber)

values, nextPage = getAssetValues(scanner, 0, false)
successful = len(assets)
}

total := addAssets(assets)
if total == 0 {
return nil, errors.New("failed when extracting other assets")
if successful != counter {
parser.addMessage(fmt.Sprintf("ignored assets: %d/%d", counter-successful, counter))
}

if total != totalAssets {
return nil, errors.New("other assets do not match")
if assets == nil {
parser.addError(fmt.Errorf("failed when extracting assets"))
return nil, nil
}

// Reset the package-level state for the next call.
totalAssets = 0
assetsItemNumber = 0

return assets, nil
}

// getAssetValues reads lines through getAssetLine and stores them in values,
// starting at position index. It returns as soon as position 6 is filled.
func getAssetValues(scanner *bufio.Scanner, index int, remaining bool) (values [7]string, nextPage bool) {
line, _ := getAssetLine(scanner)
for line != "" {

values[index] = line

// All the possible values for a single item have been read.
if index == 6 {
return
}

index++

line, nextPage = getAssetLine(scanner)
/*
isAssetFormField reports whether s is the header row of the section.
After accents are removed from s, it must contain every column title
listed in formField.
Parameter: s, the line to check.
Return: true when every column title is present, false otherwise.
*/

func isAssetFormField(s string) bool {
formField := []string{
"DESCRIPCION",
"EMPRESA",
"RUC",
"PAIS",
"CANT.",
"PRECIO UNI.",
"IMPORTE",
}

if remaining {
return
s = removeAccents(s)
// A single missing column title is enough to reject the line.
for _, value := range formField {
if !strings.Contains(s, value) {
return false
}
}

return [7]string{}, false
return true
}

func getAsset(scanner *bufio.Scanner, values [7]string) []*declaration.OtherAsset {
// In some cases the amount (importe) of the first asset is at the end of the
// list of assets. For example Juan Afara 2014.
if !isNumber(values[6]) {
return getAsset2(scanner, values)
}
/*
getAsset loads the extracted values into the OtherAsset structure.
Parameters: values, the row as an array of strings. The first element is not
inserted because it is the index and not relevant.
Return: an instance of OtherAsset with the values from the array.
*/

return []*declaration.OtherAsset{getAsset1(values)}
}

func getAsset1(values [7]string) *declaration.OtherAsset {
func getAsset(values []string) *declaration.OtherAsset {
return &declaration.OtherAsset{
Descripcion: values[0],
Empresa: values[1],
RUC: values[2],
Pais: values[3],
Cantidad: stringToInt64(values[4]),
Precio: stringToInt64(values[5]),
Importe: stringToInt64(values[6]),
}
}

// getAsset2 reads a run of assets laid out so that the amount (importe) of the
// first asset is printed after the rest of the list. values holds the lines
// already read for the first asset; its last position is the description of
// the second asset instead of an amount.
//
// It returns every asset read from that run. skipAssets and assetsItemNumber
// are restored before returning, apart from one remaining entry that the
// caller is expected to overwrite.
func getAsset2(scanner *bufio.Scanner, values [7]string) []*declaration.OtherAsset {
	// skipNextItem registers the next item number as a line to be ignored.
	skipNextItem := func() {
		assetsItemNumber++
		skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber))
	}

	first := getAsset1(values)
	result := []*declaration.OtherAsset{}
	result = append(result, first)
	skipNextItem()

	// The description of the second asset was read into the last slot of the
	// first one; carry it over and read the rest of the second row.
	carried := values[6]
	values, _ = getAssetValues(scanner, 1, false)
	values[0] = carried
	result = append(result, getAsset1(values))

	// Skip the item number that follows.
	skipNextItem()

	// Keep reading complete rows until a short row or a page change.
	extra := 0
	var nextPage bool
	for {
		values, nextPage = getAssetValues(scanner, 0, true)
		if values[1] == "" || nextPage {
			break
		}

		result = append(result, getAsset1(values))
		skipNextItem()
		extra++
	}

	// The last value read is the amount that belongs to the first asset.
	first.Importe = stringToInt64(values[0])

	// Drop the item numbers registered here so skipAssets is back to its
	// default state; the caller takes care of the one remaining entry.
	skipAssets = skipAssets[:len(skipAssets)-extra-2]
	assetsItemNumber = 1

	return result
}

// getAssetLine returns the next line from scanner that is not filtered out,
// or "" with nextPage false when the scanner is exhausted.
//
// When the "TOTAL OTROS ACTIVOS" line is found, the section total is stored in
// totalAssets, the scanner is moved to "TIPO MUEBLES", the item numbering is
// restarted, and nextPage is reported as true with the line that is eventually
// returned.
func getAssetLine(scanner *bufio.Scanner) (line string, nextPage bool) {
	for scanner.Scan() {
		line = scanner.Text()

		// End of the assets table on this page.
		if line == "TOTAL OTROS ACTIVOS" {
			totalAssets = getTotalInCategory(scanner)

			// Jump to the next page, or to the end.
			scanner = MoveUntil(scanner, "TIPO MUEBLES", true)
			line = scanner.Text()
			nextPage = true

			assetsItemNumber = 1
			skipAssets[len(skipAssets)-1] = strconv.Itoa(assetsItemNumber)
		}

		// Lines that never carry an asset value are ignored.
		switch {
		case strings.Contains(line, "OBS:"), strings.Contains(line, "RECEPCIONADO EL:"):
			continue
		case isDate(line), isBarCode(line):
			continue
		case line == "", contains(skipAssets, line):
			continue
		}

		return line, nextPage
	}

	return "", false
}

// addAssets returns the sum of the Importe field over all the given assets.
func addAssets(assets []*declaration.OtherAsset) int64 {
var total int64
for _, a := range assets {
total += a.Importe
// Field mapping that starts at values[1]; values[0] is not read by these lines.
Descripcion: values[1],
Empresa: values[2],
RUC: values[3],
Pais: values[4],
Cantidad: stringToInt64(values[5]),
Precio: stringToInt64(values[6]),
Importe: stringToInt64(values[7]),
}

return total
}
12 changes: 5 additions & 7 deletions parser/extract/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ package extract

import (
"bufio"
"code.sajari.com/docconv"
"encoding/json"
"fmt"
"github.com/InstIDEA/ddjj/parser/declaration"
"io"
"strings"
"time"

"code.sajari.com/docconv"
"github.com/InstIDEA/ddjj/parser/declaration"
)

type ParserData struct {
Expand Down Expand Up @@ -156,11 +157,8 @@ func ParsePDF(file io.Reader) ParserData {
}

// Other assets
scanner = bufio.NewScanner(strings.NewReader(res.Body))
d.OtherAssets, err = Assets(scanner)
if err != nil {
parser.addError(err)
}
scanner = bufio.NewScanner(strings.NewReader(pl_res.Body))
d.OtherAssets, err = Assets(NewExtractor(pl_res.Body), &parser)

// Debts
scanner = bufio.NewScanner(strings.NewReader(res.Body))
Expand Down
20 changes: 19 additions & 1 deletion parser/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ package main

import (
"fmt"
"github.com/InstIDEA/ddjj/parser/extract"
"reflect"
"testing"

"github.com/InstIDEA/ddjj/parser/extract"
)

func TestDarioRamon(t *testing.T) {
Expand Down Expand Up @@ -194,6 +195,23 @@ func TestNataliaDure2019(t *testing.T) {
AssertEqual(t, "2019-03-07", data.Data.Fecha.Format("2006-01-02"))
}

// TestHorarioCartes2021 parses the 2021 declaration of Horacio Manuel Cartes
// Jara and checks the extracted name, date and summary totals.
func TestHorarioCartes2021(t *testing.T) {
	data := handleSingleFile("./test_declarations/961570_HORACIO_MANUEL_CARTES_JARA.pdf")

	// Every assertion below dereferences data.Data, so stop here instead of
	// letting the test panic with a nil pointer.
	if data.Data == nil {
		t.Fatal("Error parsing the document")
	}

	data.Print()

	AssertEqual(t, "HORACIO MANUEL", data.Data.Nombre)
	AssertEqual(t, "2021-09-30", data.Data.Fecha.Format("2006-01-02"))
	AssertEqual(t, int64(3384230397736), data.Data.Resumen.TotalActivo)
	AssertEqual(t, int64(2256141600), data.Data.Resumen.TotalPasivo)
	AssertEqual(t, int64(3381974256136), data.Data.Resumen.PatrimonioNeto)
}

// AssertEqual checks if values are equal
func AssertEqual(t *testing.T, want interface{}, got interface{}) {
if want == got {
Expand Down
Binary file not shown.