Skip to content

changes in debtors extractor in order to resolve issue Error parsing … #12

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 70 additions & 95 deletions parser/extract/debtors.go
Original file line number Diff line number Diff line change
@@ -1,122 +1,97 @@
package extract

import (
"bufio"
"errors"
"strconv"
"fmt"
"strings"

"github.com/InstIDEA/ddjj/parser/declaration"
)

// Debtors returns the debts people have with the official.
// NOTE(review): this span is a rendered diff of parser/extract/debtors.go.
// Lines of the removed, scanner-based Debtors and of the added,
// Extractor-based Debtors are interleaved without +/- markers, so the text
// below does not compile as shown. Comments marked (old) describe the removed
// implementation and comments marked (new) describe the added one.

// (old) Debtors reads the "1.3 CUENTAS A COBRAR" section line by line and
// assembles one Debtor from every four consecutive data lines.
func Debtors(scanner *bufio.Scanner) ([]*declaration.Debtor, error) {
// (old) Lines equal to any of these header labels are skipped.
var skip = []string{
"#",
"NOMBRE DEL DEUDOR",
"CLASE (A LA VISTA O PLAZOS)",
"PLAZO EN",
"IMPORTE",
}

scanner = MoveUntil(scanner, "1.3 CUENTAS A COBRAR", true)

var debtors []*declaration.Debtor
opts := &debtorOpts{
debtor: &declaration.Debtor{},
counter: 0,
}

// (old) The row index of the next expected item is appended to skip so that
// the "#" column value is not mistaken for data.
index := 1
skip = append(skip, strconv.Itoa(index))
var total int64
for scanner.Scan() {
line := scanner.Text()

// Stop looking for debtors when this is found.
if line == "TOTAL CUENTAS POR COBRAR:" {
total = getTotalInCategory(scanner)

// Next page or end.
scanner = MoveUntil(scanner, "NOMBRE DEL DEUDOR", true)
line = scanner.Text()
if line == "" {
break
// (new) Debtors walks the extractor tokens of the "1.3 CUENTAS A COBRAR"
// section and builds one Debtor per table row that tokenizes into exactly five
// values. Problems are reported through parser instead of the returned error,
// which is nil on every return path shown here.
func Debtors(e *Extractor, parser *ParserData) ([]*declaration.Debtor, error) {
var debtors []*declaration.Debtor // list of extracted debtors
debt := &declaration.Debtor{} // scratch value for the debtor currently being extracted
e.BindFlag(EXTRACTOR_FLAG_1) // removes blank lines (per the original author's note)
e.BindFlag(EXTRACTOR_FLAG_2) // trims leading and trailing spaces (per the original author's note)
// EXTRACTOR_FLAG_3 creates new tokens whenever a line contains three or more spaces; it is not bound here.
// bandera ("flag") is true while the scan is inside the table records.
var bandera bool
bandera = false
// counter counts "OBS:" lines seen inside the table; successful is set to the
// number of debtors extracted. The two are compared after the scan.
counter := 0
successful := 0
if e.MoveUntilStartWith(CurrToken, "1.3 CUENTAS A COBRAR") {
for e.Scan() {
// The table header line is not a record: skip it and start reading records.
if isAssetFormField(e.CurrToken) {
bandera = true // the header was found, so the following tokens are table records
continue
}

index = 1
}

if strings.Contains(line, "OBS:") {
continue
}
if contains(skip, line) || line == "" {
if line == strconv.Itoa(index) {
// Delete the index to avoid confusion with Plazo.
skip = skip[:len(skip)-1]
// (new) An "OBS:" line inside the table is counted and otherwise skipped.
if strings.Contains(e.CurrToken, "OBS:") && bandera {
counter++
continue
}
// End of the debtors table on the current page.
if strings.Contains(e.CurrToken, "TOTAL CUENTAS POR COBRAR:") {
bandera = false
}
// Only extract while bandera is true, i.e. while inside the table.
if bandera {
values := tokenize(e.CurrToken, 3)
// A row is accepted only when it splits into exactly five values.
if len(values) == 5 {
debt = detDebtor(values)
debtors = append(debtors, debt)
}
}
continue
}

d := getDebtor(opts, line)
if d != nil {
debtors = append(debtors, d)
opts.counter = -1
opts.debtor = &declaration.Debtor{}

// Skip the following item #.
index++
skip[len(skip)-1] = strconv.Itoa(index)
}

opts.counter++
successful = len(debtors)
}

// (old) The sum of the extracted amounts is checked against the section total.
totalDebtors := addDebtors(debtors)

if total == 0 {
return nil, errors.New("failed when extracting debtors")
// (new) Report how many counted rows did not yield a debtor.
if successful != counter {
parser.addMessage(fmt.Sprintf("ignored debtors: %d/%d", counter-successful, counter))
}

if totalDebtors != total {
return nil, errors.New("debtors do not match")
// (new) Nothing extracted: record the failure on parser and return no error value.
if debtors == nil {
parser.addError(fmt.Errorf("failed when extracting debtors"))
return nil, nil
}

return debtors, nil
}

// debtorOpts carries the state getDebtor keeps between successive lines.
type debtorOpts struct {
// debtor is the Debtor being filled in, one field per line.
debtor *declaration.Debtor
// counter selects which field of debtor the next line is stored in.
counter int
}
/*
isAssetFormField reports whether a given string is the header row of the debtors table.
Parameter: s, the string to check.
Return: true if s contains every header column label, false otherwise.
*/

// NOTE(review): rendered diff. The removed getDebtor and the added
// isAssetFormField are interleaved below without +/- markers.

// (old) getDebtor stores line in the field of opts.debtor selected by
// opts.counter. It returns opts.debtor once the fourth field (Importe) has
// been stored, and nil for every other counter value.
func getDebtor(opts *debtorOpts, line string) *declaration.Debtor {
switch opts.counter {
case 0:
opts.debtor.Nombre = line
break
case 1:
opts.debtor.Clase = line
break
case 2:
// The conversion error is discarded; Plazo gets whatever Atoi returned.
value, _ := strconv.Atoi(line)
opts.debtor.Plazo = value
break
case 3:
// Dots are stripped from the amount before parsing; the parse error is discarded.
value := strings.ReplaceAll(line, ".", "")
i, _ := strconv.ParseInt(value, 10, 64)
opts.debtor.Importe = i
return opts.debtor
// (new) isAssetFormField reports whether s, after removeAccents, contains
// every column label of the debtors table header.
func isAssetFormField(s string) bool {
formField := []string{
"#",
"NOMBRE DEL DEUDOR",
"CLASE (A LA VISTA O PLAZOS)",
"PLAZO EN",
"IMPORTE",
}

// (old) Trailing return of getDebtor: no complete debtor yet.
return nil
s = removeAccents(s)
// A single missing label means s is not the header.
for _, value := range formField {
if !strings.Contains(s, value) {
return false
}
}

return true
}

// NOTE(review): rendered diff. The removed addDebtors and the added
// detDebtor are interleaved below without +/- markers.

// (old) addDebtors returns the sum of Importe over debtors.
func addDebtors(debtors []*declaration.Debtor) int64 {
var total int64
for _, d := range debtors {
total += d.Importe
/*
detDebtor loads the extracted values into a Debtor structure.
Parameters: values, an array of strings. The first element is not used; the
original author notes that it is the row index and not relevant.
Return: a new Debtor built from values[1] through values[4].
*/

func detDebtor(values []string) *declaration.Debtor {
return &declaration.Debtor{
Nombre: values[1],
Clase: values[2],
Plazo: stringToInt(values[3]),
Importe: stringToInt64(values[4]),
}

// (old) Trailing return of addDebtors.
return total
}
12 changes: 5 additions & 7 deletions parser/extract/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ package extract

import (
"bufio"
"code.sajari.com/docconv"
"encoding/json"
"fmt"
"github.com/InstIDEA/ddjj/parser/declaration"
"io"
"strings"
"time"

"code.sajari.com/docconv"
"github.com/InstIDEA/ddjj/parser/declaration"
)

type ParserData struct {
Expand Down Expand Up @@ -124,11 +125,8 @@ func ParsePDF(file io.Reader) ParserData {
}

// Debtors.
scanner = bufio.NewScanner(strings.NewReader(res.Body))
d.Debtors, err = Debtors(scanner)
if err != nil {
parser.addError(err)
}
scanner = bufio.NewScanner(strings.NewReader(pl_res.Body))
d.Debtors, err = Debtors(NewExtractor(pl_res.Body), &parser)

// Real state.
scanner = bufio.NewScanner(strings.NewReader(res.Body))
Expand Down
35 changes: 34 additions & 1 deletion parser/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ package main

import (
"fmt"
"github.com/InstIDEA/ddjj/parser/extract"
"reflect"
"testing"

"github.com/InstIDEA/ddjj/parser/extract"
)

func TestDarioRamon(t *testing.T) {
Expand Down Expand Up @@ -194,6 +195,38 @@ func TestNataliaDure2019(t *testing.T) {
AssertEqual(t, "2019-03-07", data.Data.Fecha.Format("2006-01-02"))
}

// TestEddyNeufeld2016 parses the declaration PDF
// 2024982_9fb18b249891f3e2f290e33e588d98b1.pdf and checks the extracted name,
// date, spouse, first institution, and summary totals.
func TestEddyNeufeld2016(t *testing.T) {
	data := handleSingleFile("./test_declarations/2024982_9fb18b249891f3e2f290e33e588d98b1.pdf")

	// Fatalf rather than Errorf: everything below dereferences data.Data, so
	// continuing after a failed parse would panic instead of reporting the
	// failure cleanly.
	if data.Data == nil {
		t.Fatalf("Error parsing the document")
	}

	for _, item := range data.Message {
		fmt.Println(item)
	}

	// The test indexes Instituciones[0]; stop with a clear message instead of
	// an index-out-of-range panic when no institution was extracted.
	if len(data.Data.Instituciones) == 0 {
		t.Fatalf("no institutions extracted from the document")
	}
	institution := data.Data.Instituciones[0]

	fmt.Printf("\n\n")
	fmt.Println("Nombre: ", data.Data.Nombre)
	fmt.Println("Fecha: ", data.Data.Fecha)
	fmt.Println("Conyuge: ", data.Data.Conyuge)
	fmt.Println("Cargo: ", institution.Cargo)
	fmt.Println("Institucion: ", institution.Institucion)
	fmt.Println("Resumen Activos: ", data.Data.Resumen.TotalActivo)
	fmt.Println("Resumen Pasivos: ", data.Data.Resumen.TotalPasivo)
	fmt.Println("Resumen Patrimonio Neto: ", data.Data.Resumen.PatrimonioNeto)

	AssertEqual(t, "EDDY", data.Data.Nombre)
	AssertEqual(t, "2016-01-04", data.Data.Fecha.Format("2006-01-02"))
	AssertEqual(t, "INTENDENTE MUNICIPAL", institution.Cargo)
	AssertEqual(t, "MUNICIPALIDAD DE RAUL ARSENIO OVIEDO", institution.Institucion)
	AssertEqual(t, "MIRNA ELIZABETH FLORENCIAÑEZ NEUFELD", data.Data.Conyuge)
	AssertEqual(t, int64(108601862791), data.Data.Resumen.TotalActivo)
	AssertEqual(t, int64(38970873094), data.Data.Resumen.TotalPasivo)
	AssertEqual(t, int64(69630989697), data.Data.Resumen.PatrimonioNeto)
}

// AssertEqual checks if values are equal
func AssertEqual(t *testing.T, want interface{}, got interface{}) {
if want == got {
Expand Down
Binary file not shown.