diff --git a/parser/extract/asset.go b/parser/extract/asset.go index d834737..0e14c5c 100644 --- a/parser/extract/asset.go +++ b/parser/extract/asset.go @@ -1,197 +1,156 @@ package extract import ( - "bufio" - "github.com/pkg/errors" - "strconv" + "fmt" "strings" "github.com/InstIDEA/ddjj/parser/declaration" ) -var totalAssets int64 - -var assetsItemNumber int - -var skipAssets = []string{ - "#", - "DESCRIPCIÓN", - "EMPRESA", - "RUC", - "PAÍS", - "CANT.", - "PRECIO UNI.", - "IMPORTE", -} - -// Assets returns other assets owned by the official. -func Assets(scanner *bufio.Scanner) ([]*declaration.OtherAsset, error) { - scanner = MoveUntil(scanner, "1.9 OTROS ACTIVOS", true) - - // Also wants to skip item number - assetsItemNumber = 1 - skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber)) - - var assets []*declaration.OtherAsset - - values, nextPage := getAssetValues(scanner, 0, false) - for values[0] != "" { - asset := getAsset(scanner, values) - assets = append(assets, asset...) - - if nextPage { - assetsItemNumber = 1 - } else { - assetsItemNumber++ +func Assets(e *Extractor, parser *ParserData) ([]*declaration.OtherAsset, error) { + var assets []*declaration.OtherAsset //lsit of extracted assets + asset := &declaration.OtherAsset{} //aux for the actual extraction + e.BindFlag(EXTRACTOR_FLAG_1) //remueve las lineas en blanco + e.BindFlag(EXTRACTOR_FLAG_2) //remueve los espacios en los extremos + //EXTRACTOR_FLAG_3 crea nuevos tokens siempre que dentro de la linea haya mas o igual a 3 espacios + var bandera bool + bandera = false + counter := 0 + successful := 0 + if e.MoveUntilStartWith(CurrToken, "1.9 OTROS ACTIVOS") { + for e.Scan() { + // other assets table header and OBS are omitted + if isAssetFormField(e.CurrToken) { + bandera = true //we are in the table records because we have the header + continue + } + if strings.Contains(e.CurrToken, "OBS:") && bandera { + counter++ + continue + } + // final of others assets of current page + if strings.Contains(e.CurrToken, "TOTAL OTROS ACTIVOS") { + bandera = false + } + //if the ban it's true, we can proceed with the extraction + if bandera { + values := tokenize(e.CurrToken, 3) + //case 1: Description is in two lines + //in this case the lines are + //descPart1 + //number of the register + //descPart2 + //rest of row + if len(values) == 1 && isNumber(e.CurrToken) { + description := e.PrevToken + " " + e.NextToken + // moving the current token to the next part + e.Scan() + e.Scan() + + //building the struct of other assets + fixed := []string{"#", description} + values = append(fixed, tokenize(e.CurrToken, 3)...) + } else + //case 2: Enterprise name is in two lines + //in this case the lines are + //enterprisePart1 + //number of the register + description + //enterprisePart2 + //rest of row + if len(values) == 2 { + enterpriseNamePart1 := e.PrevToken + //extracting the description of the currentToken thats saved on values array + description := values[1] + e.Scan() // we need to save the description in this part + allName := enterpriseNamePart1 + " " + e.CurrToken + //moving to the rest of the row + e.Scan() + + //building the struct of other assets + fixed := []string{"#", description, allName} + values = append(fixed, tokenize(e.CurrToken, 3)...) + + } else + //case 3: country in two lines + //namePart1 + //num + description + enterprise + ruc + //namePart2 + //cant + price + total + if len(values) == 4 { + country := e.PrevToken + " " + e.NextToken + description := values[1] + enterprise := values[2] + ruc := values[3] + // moving the current token to the next part + e.Scan() + e.Scan() + + //building the struct of other assets + fixed := []string{"#", description, enterprise, ruc, country} + values = append(fixed, tokenize(e.CurrToken, 4)...) + } + + if len(values) == 8 { + asset = getAsset(values) + assets = append(assets, asset) + } + } } - // Also wants to skip item number - skipAssets[len(skipAssets)-1] = strconv.Itoa(assetsItemNumber) - - values, nextPage = getAssetValues(scanner, 0, false) + successful = len(assets) } - - total := addAssets(assets) - if total == 0 { - return nil, errors.New("failed when extracting other assets") + if successful != counter { + parser.addMessage(fmt.Sprintf("ignored assets: %d/%d", counter-successful, counter)) } - if total != totalAssets { - return nil, errors.New("other assets do not match") + if assets == nil { + parser.addError(fmt.Errorf("failed when extracting assets")) + return nil, nil } - // Reset variables for next call. - totalAssets = 0 - assetsItemNumber = 0 - return assets, nil } -func getAssetValues(scanner *bufio.Scanner, index int, remaining bool) (values [7]string, nextPage bool) { - line, _ := getAssetLine(scanner) - for line != "" { - - values[index] = line - - // After reading all the possible values for a single item. - if index == 6 { - return - } - - index++ - - line, nextPage = getAssetLine(scanner) +/* +Function to check if a given string is or not the header of the section. +Parameter: string s +Return: True or false +*/ + +func isAssetFormField(s string) bool { + formField := []string{ + "DESCRIPCION", + "EMPRESA", + "RUC", + "PAIS", + "CANT.", + "PRECIO UNI.", + "IMPORTE", } - if remaining { - return + s = removeAccents(s) + for _, value := range formField { + if !strings.Contains(s, value) { + return false + } } - return [7]string{}, false + return true } -func getAsset(scanner *bufio.Scanner, values [7]string) []*declaration.OtherAsset { - // En algunos casos, el importe del primer activo está al final de la lista - // de activos. Por ejemplo Juan Afara 2014 - if !isNumber(values[6]) { - return getAsset2(scanner, values) - } +/* +Function to load the extracted values into the OtherAsset structure. +Parameters: values in an array of strings. The first element is not inserted because it is the index and not relevant. +Return: an instance of OtherAsset with the values from the array +*/ - return []*declaration.OtherAsset{getAsset1(values)} -} - -func getAsset1(values [7]string) *declaration.OtherAsset { +func getAsset(values []string) *declaration.OtherAsset { return &declaration.OtherAsset{ - Descripcion: values[0], - Empresa: values[1], - RUC: values[2], - Pais: values[3], - Cantidad: stringToInt64(values[4]), - Precio: stringToInt64(values[5]), - Importe: stringToInt64(values[6]), - } -} - -func getAsset2(scanner *bufio.Scanner, values [7]string) []*declaration.OtherAsset { - assets := []*declaration.OtherAsset{} - - firstAsset := getAsset1(values) - assets = append(assets, firstAsset) - - assetsItemNumber++ - skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber)) - - // values[6] is the descripcion in the second element. - tmp := values[6] - values, _ = getAssetValues(scanner, 1, false) - values[0] = tmp - secondAsset := getAsset1(values) - assets = append(assets, secondAsset) - - // Skip next item number. - assetsItemNumber++ - skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber)) - - values, nextPage := getAssetValues(scanner, 0, true) - counter := 0 - for values[1] != "" && !nextPage { - assets = append(assets, getAsset1(values)) - - assetsItemNumber++ - skipAssets = append(skipAssets, strconv.Itoa(assetsItemNumber)) - counter++ - - values, nextPage = getAssetValues(scanner, 0, true) - } - - // The last value is the importe for the first item. - firstAsset.Importe = stringToInt64(values[0]) - - // Restore skip assets to default state. The caller would remove the other - // remaining value. - skipAssets = skipAssets[:len(skipAssets)-counter-2] - assetsItemNumber = 1 - - return assets -} - -func getAssetLine(scanner *bufio.Scanner) (line string, nextPage bool) { - for scanner.Scan() { - line = scanner.Text() - - // Stop looking for assets when this is found. - if line == "TOTAL OTROS ACTIVOS" { - totalAssets = getTotalInCategory(scanner) - - // Next page or end. - scanner = MoveUntil(scanner, "TIPO MUEBLES", true) - line = scanner.Text() - nextPage = true - - assetsItemNumber = 1 - skipAssets[len(skipAssets)-1] = strconv.Itoa(assetsItemNumber) - } - - if strings.Contains(line, "OBS:") || strings.Contains(line, "RECEPCIONADO EL:") { - continue - } - if isDate(line) || isBarCode(line) { - continue - } - if line == "" || contains(skipAssets, line) { - continue - } - - return line, nextPage - } - - return "", false -} - -func addAssets(assets []*declaration.OtherAsset) int64 { - var total int64 - for _, a := range assets { - total += a.Importe + Descripcion: values[1], + Empresa: values[2], + RUC: values[3], + Pais: values[4], + Cantidad: stringToInt64(values[5]), + Precio: stringToInt64(values[6]), + Importe: stringToInt64(values[7]), } - - return total } diff --git a/parser/extract/parser.go b/parser/extract/parser.go index 0e32788..5108cdd 100644 --- a/parser/extract/parser.go +++ b/parser/extract/parser.go @@ -2,13 +2,14 @@ package extract import ( "bufio" - "code.sajari.com/docconv" "encoding/json" "fmt" - "github.com/InstIDEA/ddjj/parser/declaration" "io" "strings" "time" + + "code.sajari.com/docconv" + "github.com/InstIDEA/ddjj/parser/declaration" ) type ParserData struct { @@ -156,11 +157,8 @@ func ParsePDF(file io.Reader) ParserData { } // Other assets - scanner = bufio.NewScanner(strings.NewReader(res.Body)) - d.OtherAssets, err = Assets(scanner) - if err != nil { - parser.addError(err) - } + scanner = bufio.NewScanner(strings.NewReader(pl_res.Body)) + d.OtherAssets, err = Assets(NewExtractor(pl_res.Body), &parser) // Debts scanner = bufio.NewScanner(strings.NewReader(res.Body)) diff --git a/parser/main_test.go b/parser/main_test.go index e21317a..c4f693d 100644 --- a/parser/main_test.go +++ b/parser/main_test.go @@ -2,9 +2,10 @@ package main import ( "fmt" - "github.com/InstIDEA/ddjj/parser/extract" "reflect" "testing" + + "github.com/InstIDEA/ddjj/parser/extract" ) func TestDarioRamon(t *testing.T) { @@ -194,6 +195,23 @@ func TestNataliaDure2019(t *testing.T) { AssertEqual(t, "2019-03-07", data.Data.Fecha.Format("2006-01-02")) } +func TestHorarioCartes2021(t *testing.T) { + + data := handleSingleFile("./test_declarations/961570_HORACIO_MANUEL_CARTES_JARA.pdf") + + if data.Data == nil { + t.Errorf("Error parsing the document") + } + + data.Print() + + AssertEqual(t, "HORACIO MANUEL", data.Data.Nombre) + AssertEqual(t, "2021-09-30", data.Data.Fecha.Format("2006-01-02")) + AssertEqual(t, int64(3384230397736), data.Data.Resumen.TotalActivo) + AssertEqual(t, int64(2256141600), data.Data.Resumen.TotalPasivo) + AssertEqual(t, int64(3381974256136), data.Data.Resumen.PatrimonioNeto) +} + // AssertEqual checks if values are equal func AssertEqual(t *testing.T, want interface{}, got interface{}) { if want == got { diff --git a/parser/test_declarations/961570_HORACIO_MANUEL_CARTES_JARA.pdf b/parser/test_declarations/961570_HORACIO_MANUEL_CARTES_JARA.pdf new file mode 100644 index 0000000..cac578f Binary files /dev/null and b/parser/test_declarations/961570_HORACIO_MANUEL_CARTES_JARA.pdf differ