Skip to content

Corrección de extracción de otros activos #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
299 changes: 129 additions & 170 deletions parser/extract/asset.go
Original file line number Diff line number Diff line change
@@ -1,197 +1,156 @@
package extract

import (
"bufio"
"github.com/pkg/errors"
"strconv"
"fmt"
"strings"

"github.com/InstIDEA/ddjj/parser/declaration"
)

var totalAssets int64

var assetsItemNumber int

var skipAssets = []string{
"#",
"DESCRIPCIÓN",
"EMPRESA",
"RUC",
"PAÍS",
"CANT.",
"PRECIO UNI.",
"IMPORTE",
}

// Assets returns other assets owned by the official.
func Assets(scanner *bufio.Scanner) ([]*declaration.OtherAsset, error) {
scanner = MoveUntil(scanner, "1.9 OTROS ACTIVOS", true)

// Also wants to skip item number
assetsItemNumber = 1
skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber))

var assets []*declaration.OtherAsset

values, nextPage := getAssetValues(scanner, 0, false)
for values[0] != "" {
asset := getAsset(scanner, values)
assets = append(assets, asset...)

if nextPage {
assetsItemNumber = 1
} else {
assetsItemNumber++
func Assets(e *Extractor, parser *ParserData) ([]*declaration.OtherAsset, error) {
var assets []*declaration.OtherAsset //lsit of extracted assets
asset := &declaration.OtherAsset{} //aux for the actual extraction
e.BindFlag(EXTRACTOR_FLAG_1) //remueve las lineas en blanco
e.BindFlag(EXTRACTOR_FLAG_2) //remueve los espacios en los extremos
//EXTRACTOR_FLAG_3 crea nuevos tokens siempre que dentro de la linea haya mas o igual a 3 espacios
var bandera bool
bandera = false
counter := 0
successful := 0
if e.MoveUntilStartWith(CurrToken, "1.9 OTROS ACTIVOS") {
for e.Scan() {
// other assets table header and OBS are omitted
if isAssetFormField(e.CurrToken) {
bandera = true //we are in the table records because we have the header
continue
}
if strings.Contains(e.CurrToken, "OBS:") && bandera {
counter++
continue
}
// final of others assets of current page
if strings.Contains(e.CurrToken, "TOTAL OTROS ACTIVOS") {
bandera = false
}
//if the ban it's true, we can proceed with the extraction
if bandera {
values := tokenize(e.CurrToken, 3)
//case 1: Description is in two lines
//in this case the lines are
//descPart1
//number of the register
//descPart2
//rest of row
if len(values) == 1 && isNumber(e.CurrToken) {
description := e.PrevToken + " " + e.NextToken
// moving the current token to the next part
e.Scan()
e.Scan()

//building the struct of other assets
fixed := []string{"#", description}
values = append(fixed, tokenize(e.CurrToken, 3)...)
} else
//case 2: Enterprise name is in two lines
//in this case the lines are
//enterprisePart1
//number of the register + description
//enterprisePart2
//rest of row
if len(values) == 2 {
enterpriseNamePart1 := e.PrevToken
//extracting the description of the currentToken thats saved on values array
description := values[1]
e.Scan() // we need to save the description in this part
allName := enterpriseNamePart1 + " " + e.CurrToken
//moving to the rest of the row
e.Scan()

//building the struct of other assets
fixed := []string{"#", description, allName}
values = append(fixed, tokenize(e.CurrToken, 3)...)

} else
//case 3: country in two lines
//namePart1
//num + description + enterprise + ruc
//namePart2
//cant + price + total
if len(values) == 4 {
country := e.PrevToken + " " + e.NextToken
description := values[1]
enterprise := values[2]
ruc := values[3]
// moving the current token to the next part
e.Scan()
e.Scan()

//building the struct of other assets
fixed := []string{"#", description, enterprise, ruc, country}
values = append(fixed, tokenize(e.CurrToken, 4)...)
}

if len(values) == 8 {
asset = getAsset(values)
assets = append(assets, asset)
}
}
}
// Also wants to skip item number
skipAssets[len(skipAssets)-1] = strconv.Itoa(assetsItemNumber)

values, nextPage = getAssetValues(scanner, 0, false)
successful = len(assets)
}

total := addAssets(assets)
if total == 0 {
return nil, errors.New("failed when extracting other assets")
if successful != counter {
parser.addMessage(fmt.Sprintf("ignored assets: %d/%d", counter-successful, counter))
}

if total != totalAssets {
return nil, errors.New("other assets do not match")
if assets == nil {
parser.addError(fmt.Errorf("failed when extracting assets"))
return nil, nil
}

// Reset variables for next call.
totalAssets = 0
assetsItemNumber = 0

return assets, nil
}

func getAssetValues(scanner *bufio.Scanner, index int, remaining bool) (values [7]string, nextPage bool) {
line, _ := getAssetLine(scanner)
for line != "" {

values[index] = line

// After reading all the possible values for a single item.
if index == 6 {
return
}

index++

line, nextPage = getAssetLine(scanner)
/*
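isAssetFormField reports whether a given string is the header row of the section's table.
Parameter: s, the line to check; it is passed through removeAccents before matching.
Return: true only if s contains every column title of the header, false otherwise.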
*/

func isAssetFormField(s string) bool {
formField := []string{
"DESCRIPCION",
"EMPRESA",
"RUC",
"PAIS",
"CANT.",
"PRECIO UNI.",
"IMPORTE",
}

if remaining {
return
s = removeAccents(s)
for _, value := range formField {
if !strings.Contains(s, value) {
return false
}
}

return [7]string{}, false
return true
}

func getAsset(scanner *bufio.Scanner, values [7]string) []*declaration.OtherAsset {
// En algunos casos, el importe del primer activo está al final de la lista
// de activos. Por ejemplo Juan Afara 2014
if !isNumber(values[6]) {
return getAsset2(scanner, values)
}
/*
getAsset loads the extracted values into an OtherAsset structure.
Parameters: values, a slice of strings. The first element (index 0) is not inserted because it is the row index and is not relevant.
Return: a pointer to an OtherAsset populated from values[1] through values[7].
*/

return []*declaration.OtherAsset{getAsset1(values)}
}

func getAsset1(values [7]string) *declaration.OtherAsset {
func getAsset(values []string) *declaration.OtherAsset {
return &declaration.OtherAsset{
Descripcion: values[0],
Empresa: values[1],
RUC: values[2],
Pais: values[3],
Cantidad: stringToInt64(values[4]),
Precio: stringToInt64(values[5]),
Importe: stringToInt64(values[6]),
}
}

// getAsset2 builds the list of assets for the layout in which values[6] holds
// the description of the second item instead of the amount (importe) of the
// first one; the first item's amount is only read at the end.
//
// It reads further rows from scanner through getAssetValues and, while doing
// so, temporarily grows the package-level skipAssets list with the item
// numbers it passes. Before returning it trims skipAssets again and resets
// assetsItemNumber to 1.
//
// It returns every asset collected, the first one included.
func getAsset2(scanner *bufio.Scanner, values [7]string) []*declaration.OtherAsset {
assets := []*declaration.OtherAsset{}

// The first asset is built from values as received; its Importe is
// overwritten once the trailing amount has been read (see below).
firstAsset := getAsset1(values)
assets = append(assets, firstAsset)

// Register the next item number so it is skipped while reading lines.
assetsItemNumber++
skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber))

// values[6] is the description of the second element. The remaining fields
// are requested starting at index 1 and the description is put back at 0.
tmp := values[6]
values, _ = getAssetValues(scanner, 1, false)
values[0] = tmp
secondAsset := getAsset1(values)
assets = append(assets, secondAsset)

// Skip next item number.
assetsItemNumber++
skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber))

// Keep collecting items while a second field is present and no page change
// was reported. counter tracks how many skip entries this loop adds.
values, nextPage := getAssetValues(scanner, 0, true)
counter := 0
for values[1] != "" && !nextPage {
assets = append(assets, getAsset1(values))

assetsItemNumber++
skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber))
counter++

values, nextPage = getAssetValues(scanner, 0, true)
}

// The last value is the amount (importe) for the first item.
firstAsset.Importe = stringToInt64(values[0])

// Restore skip assets to default state: drop the counter entries added in
// the loop plus the two added before it. The caller would remove the other
// remaining value.
skipAssets = skipAssets[:len(skipAssets)-counter-2]
assetsItemNumber = 1

return assets
}

// getAssetLine advances scanner and returns the next line that is relevant
// for the other-assets table, together with a flag telling whether the
// "TOTAL OTROS ACTIVOS" marker was crossed while looking for it.
//
// When the marker is found the section total is stored in totalAssets, the
// scanner is moved forward with MoveUntil(scanner, "TIPO MUEBLES", true) and
// the item numbering kept in assetsItemNumber / skipAssets starts over at 1.
//
// It returns ("", false) once the scanner has no more lines.
func getAssetLine(scanner *bufio.Scanner) (line string, nextPage bool) {
	for scanner.Scan() {
		line = scanner.Text()

		// End of the assets listed on this page: record the total and jump
		// to the next page (or to the end of the input).
		if line == "TOTAL OTROS ACTIVOS" {
			totalAssets = getTotalInCategory(scanner)

			scanner = MoveUntil(scanner, "TIPO MUEBLES", true)
			line, nextPage = scanner.Text(), true

			assetsItemNumber = 1
			skipAssets[len(skipAssets)-1] = strconv.Itoa(assetsItemNumber)
		}

		// Lines that never carry asset values are ignored. The checks run in
		// the listed order and stop at the first one that matches.
		switch {
		case strings.Contains(line, "OBS:"), strings.Contains(line, "RECEPCIONADO EL:"):
			continue
		case isDate(line), isBarCode(line):
			continue
		case line == "", contains(skipAssets, line):
			continue
		}

		return line, nextPage
	}

	return "", false
}

func addAssets(assets []*declaration.OtherAsset) int64 {
var total int64
for _, a := range assets {
total += a.Importe
Descripcion: values[1],
Empresa: values[2],
RUC: values[3],
Pais: values[4],
Cantidad: stringToInt64(values[5]),
Precio: stringToInt64(values[6]),
Importe: stringToInt64(values[7]),
}

return total
}
12 changes: 5 additions & 7 deletions parser/extract/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ package extract

import (
"bufio"
"code.sajari.com/docconv"
"encoding/json"
"fmt"
"github.com/InstIDEA/ddjj/parser/declaration"
"io"
"strings"
"time"

"code.sajari.com/docconv"
"github.com/InstIDEA/ddjj/parser/declaration"
)

type ParserData struct {
Expand Down Expand Up @@ -156,11 +157,8 @@ func ParsePDF(file io.Reader) ParserData {
}

// Other assets
scanner = bufio.NewScanner(strings.NewReader(res.Body))
d.OtherAssets, err = Assets(scanner)
if err != nil {
parser.addError(err)
}
scanner = bufio.NewScanner(strings.NewReader(pl_res.Body))
d.OtherAssets, err = Assets(NewExtractor(pl_res.Body), &parser)

// Debts
scanner = bufio.NewScanner(strings.NewReader(res.Body))
Expand Down
20 changes: 19 additions & 1 deletion parser/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ package main

import (
"fmt"
"github.com/InstIDEA/ddjj/parser/extract"
"reflect"
"testing"

"github.com/InstIDEA/ddjj/parser/extract"
)

func TestDarioRamon(t *testing.T) {
Expand Down Expand Up @@ -194,6 +195,23 @@ func TestNataliaDure2019(t *testing.T) {
AssertEqual(t, "2019-03-07", data.Data.Fecha.Format("2006-01-02"))
}

// TestHorarioCartes2021 parses the 2021 declaration stored in
// 961570_HORACIO_MANUEL_CARTES_JARA.pdf and checks the declarant's name, the
// declaration date and the three summary totals.
func TestHorarioCartes2021(t *testing.T) {
	data := handleSingleFile("./test_declarations/961570_HORACIO_MANUEL_CARTES_JARA.pdf")

	// Abort right away: every assertion below dereferences data.Data, so
	// merely recording the failure would end in a nil-pointer panic.
	if data.Data == nil {
		t.Fatal("Error parsing the document")
	}

	data.Print()

	AssertEqual(t, "HORACIO MANUEL", data.Data.Nombre)
	AssertEqual(t, "2021-09-30", data.Data.Fecha.Format("2006-01-02"))
	AssertEqual(t, int64(3384230397736), data.Data.Resumen.TotalActivo)
	AssertEqual(t, int64(2256141600), data.Data.Resumen.TotalPasivo)
	AssertEqual(t, int64(3381974256136), data.Data.Resumen.PatrimonioNeto)
}

// AssertEqual checks if values are equal
func AssertEqual(t *testing.T, want interface{}, got interface{}) {
if want == got {
Expand Down
Binary file not shown.