Parser configuration now lives in parsers.yaml

parsers.py was rewritten around a single default parser that takes its
configuration parameters from a YAML file.
Additional parsing behaviour can be added by setting the additional_parser
attribute in the YAML file and overriding the func and parse methods in child
classes of Parser.
func is called after each transaction is created, and the overridden parse
method should either call the parent parse method or reimplement the entire
parsing process.
The parse_data function is now called from the runnable, which now drives the
parsing process. The parse command can take an optional bank, which is used
before it tries to extract one from the filename, and multiple paths,
either files or directories.
The Transaction __init__ was fixed to accept inputs from a previously
initialized Transaction.
Also adds utils.py with helper functions.
This commit is contained in:
Luís Murta 2021-06-03 23:30:32 +01:00
parent 4d9e8edec8
commit 4d6c865bb5
Signed by: satprog
GPG Key ID: DDF2EFC6179009DC
7 changed files with 273 additions and 187 deletions

View File

@ -3,7 +3,7 @@
parsing -> categorize -> analyze (predict) parsing -> categorize -> analyze (predict)
## Parsing ## Parsing
Parses bank extracts, based on parsers.py functions, to SQLite db. Parses bank extracts, based on parsers.yaml, to a SQLite database.
## Categorize ## Categorize
Categorizes transactions based on categories.py filters. Categorizes transactions based on categories.py filters.

60
parsers.yaml Normal file
View File

@ -0,0 +1,60 @@
Banks:
- Bank1
- Bank2
- Bank3
CreditCards:
- VISA
- MasterCard
- AmericanExpress
default: &default
encoding: utf-8
separator: "\t"
date_fmt: "%Y-%m-%d"
Bank1:
<<: *default
separator: ";"
date_fmt: "%d/%m/%Y"
start: 6
debit:
date: 1
text: 3
value: 4
additional_parser: true
Bank2: &bank2
<<: *default
date_fmt: "%d/%m/%Y"
debit:
date: 0
text: 2
value: 3
VISA:
<<: *bank2
debit:
date: 0
text: 2
value: 3
credit:
date: 1
text: 2
value: 3
Bank3:
encoding: windows-1252
separator: ","
date_fmt: "%d-%m-%Y"
start: 8
end: -1
debit:
date: 1
text: 2
value: 3
negate: true
credit:
date: 0
text: 2
value: 4

View File

@ -1,198 +1,151 @@
from datetime import datetime from collections import namedtuple
from decimal import Decimal from decimal import Decimal
from pathlib import Path from importlib import import_module
from typing import Final
import datetime as dt
import yaml
from .transactions import Transaction from .transactions import Transaction
from . import utils
def parse_data(file: Path, append=False): cfg: Final = yaml.safe_load(open("parsers.yaml"))
name = file.stem.split("_") assert (
try: "Banks" in cfg
bank, _ = name[0], int(name[1]) ), "parsers.yaml is missing the Banks section with the list of available banks"
except ValueError:
_, bank = int(name[0]), name[1]
p = dict( Index = namedtuple(
Bank1=Bank1, "Index", ["date", "text", "value", "negate"], defaults=[-1, -1, -1, False]
Bank2=Bank2, )
Bank2CC=Bank2CC, Options = namedtuple(
BANK3=Bank3, "Options",
[
"encoding",
"separator",
"date_fmt",
"start",
"end",
"debit",
"credit",
"additional_parser",
"VISA",
"MasterCard",
"AmericanExpress",
],
defaults=["", "", "", 1, None, Index(), Index(), False, None, None, None],
) )
try:
parser = p[bank]()
except KeyError as e:
print(f"{e} {bank} parser doesnt exist. Cant parse {name}")
return
transactions = parser.parse(file) def parse_data(filename: str, bank=None) -> list:
if not bank:
bank, creditcard = utils.find_credit_institution(
filename, cfg.get("Banks"), cfg.get("CreditCards")
)
if creditcard:
options = cfg[bank][creditcard]
bank += creditcard
else:
options = cfg[bank]
if options.get("additional_parser", False):
parser = getattr(import_module("pfbudget.parsers"), bank)
transactions = parser(filename, bank, options).parse()
else:
transactions = Parser(filename, bank, options).parse()
return transactions return transactions
def transaction(line: str, bank: str, options: Options, func) -> Transaction:
    """Build a Transaction from one raw line of a bank extract.

    The line is right-stripped and split on ``options.separator``;
    ``Parser.index`` then picks the debit or credit column layout for it.
    The date cell is parsed with ``options.date_fmt`` and stored as an ISO
    string, and the value is negated when the chosen layout asks for it.

    Args:
        line: One raw text line of the extract.
        bank: Name stored on the resulting transaction.
        options: Parsing options for this bank.
        func: Hook called with the new transaction, but only when
            ``options.additional_parser`` is truthy.

    Returns:
        The newly created transaction.
    """
    fields = line.rstrip().split(options.separator)
    columns = Parser.index(fields, options)

    parsed = dt.datetime.strptime(fields[columns.date].strip(), options.date_fmt)
    iso_date = parsed.date().isoformat()

    description = fields[columns.text]
    amount = utils.parse_decimal(fields[columns.value])
    amount = -amount if columns.negate else amount

    result = Transaction(iso_date, description, bank, amount)
    if options.additional_parser:
        func(result)
    return result
class Parser: class Parser:
def parse(self, file): def __init__(self, filename: str, bank: str, options: dict):
self.filename = filename
self.bank = bank
if debit := options.get("debit", None):
options["debit"] = Index(**debit)
if credit := options.get("credit", None):
options["credit"] = Index(**credit)
self.options = Options(**options)
def func(self, transaction: Transaction):
pass pass
def parse(self) -> list:
transactions = [
transaction(line, self.bank, self.options, self.func)
for line in list(open(self.filename, encoding=self.options.encoding))[
self.options.start - 1 : self.options.end
]
]
return transactions
@staticmethod
def index(line: list, options: Options) -> Index:
if options.debit.date != -1 and options.credit.date != -1:
if options.debit.value != options.credit.value:
if line[options.debit.value]:
index = options.debit
elif line[options.credit.value]:
index = options.credit
elif options.debit.date != options.credit.date:
if line[options.debit.date]:
index = options.debit
elif line[options.credit.date]:
index = options.credit
elif options.debit.text != options.credit.text:
if line[options.debit.text]:
index = options.debit
elif line[options.credit.text]:
index = options.credit
else:
raise IndexError("Debit and credit indexes are equal")
elif options.debit.date != -1:
index = options.debit
elif options.credit.date != -1:
index = options.credit
else:
raise IndexError("No debit not credit indexes available")
return index
class Bank1(Parser): class Bank1(Parser):
"""Bank 1 parser def __init__(self, filename: str, bank: str, options: dict):
super().__init__(filename, bank, options)
self.transfers = []
self.transaction_cost = -Decimal("1")
Bank 1 transcripts have the following properties: def func(self, transaction: Transaction):
encoding: utf-8 if "transf" in transaction.description.lower() and transaction.value < 0:
separator: ; transaction.value -= self.transaction_cost
starting line: 5 self.transfers.append(transaction.date)
date format: %d/%m/%Y
The reading order is reversed to go from earlier to latest.
"""
encoding = "utf-8"
separator = ";"
def parse(self, file):
transactions = []
reader = [
line.rstrip().split(self.separator)
for line in open(file, encoding=self.encoding)
][5:]
for transaction in reversed(reader):
transaction = [field.rstrip() for field in transaction]
date = datetime.strptime(transaction[1], "%d/%m/%Y").date()
description = " ".join(transaction[3].split())
value = Decimal(transaction[4])
def parse(self) -> list:
transactions = super().parse()
for date in self.transfers:
transactions.append( transactions.append(
Transaction(date.isoformat(), description, "Bank1", value) Transaction(date, "Transaction cost", self.bank, self.transaction_cost)
) )
return transactions
class Bank2(Parser):
"""Bank 2 parser
Bank 2 transcripts have the following properties:
encoding: utf-8
separator: tab
date format: %d/%m/%Y or %d-%m-%Y
decimal separator: ,
"""
encoding = "utf-8"
separator = "\t"
def parse(self, file):
transactions = []
reader = [
line.rstrip().split(self.separator)
for line in open(file, encoding=self.encoding)
]
for transaction in reader:
try:
date = datetime.strptime(transaction[0], "%d/%m/%Y").date()
except ValueError: # date can differ due to locales
date = datetime.strptime(transaction[0], "%d-%m-%Y").date()
description = transaction[2]
# works for US and EU locales (5,000.00 and 5 000,00)
value = list(transaction[3].replace("\xa0", "")) # non-breaking space
value[-3] = "."
value = "".join(value)
value = value.replace(",", "")
value = Decimal(value)
transactions.append(
Transaction(date.isoformat(), description, "Bank2", value)
)
return transactions
class Bank2CC(Parser):
"""Bank 2 credit card parser
Bank 2 credit card transcripts have the following properties:
encoding: utf-8
separator: tab
date format: %d/%m/%Y or %d-%m-%Y
decimal separator: ,
"""
encoding = "utf-8"
separator = "\t"
def parse(self, file):
transactions = []
reader = [
line.rstrip().split(self.separator)
for line in open(file, encoding=self.encoding)
]
for transaction in reader:
try:
date = datetime.strptime(transaction[0], "%d/%m/%Y").date()
except ValueError: # date can differ due to locales
date = datetime.strptime(transaction[0], "%d-%m-%Y").date()
description = transaction[2]
# works for US and EU locales (5,000.00 and 5 000,00)
value = list(transaction[3].replace("\xa0", "")) # non-breaking space
value[-3] = "."
value = "".join(value)
value = value.replace(",", "")
value = Decimal(value)
if value > 0:
date = datetime.strptime(transaction[1], "%d/%m/%Y").date()
transactions.append(
Transaction(date.isoformat(), description, "Bank2CC", value)
)
return transactions
class Bank3(Parser):
"""Bank 3 parser
Bank 3 transcripts have the following properties:
encoding: windows-1252 (passed as argument)
separator: ;
starting line: 7
finishing line: -1
date format: %d-%m-%Y
decimal separator: ,
thousands separator: .
Bank 3 has credits in a different column from debits. These also have to be
negated. The reading order is reversed to go from earlier to latest.
"""
encoding = "windows-1252"
separator = ","
def parse(self, file):
transactions = []
reader = [
line.rstrip().split(self.separator)
for line in open(file, encoding=self.encoding)
][7:-1]
for transaction in reversed(reader):
transaction = [field.rstrip() for field in transaction]
date = datetime.strptime(transaction[1], "%d-%m-%Y").date()
description = transaction[2]
if t := transaction[3]:
t = t.replace(".", "").replace(",", ".")
value = -Decimal(t)
else:
t = transaction[4].replace(".", "").replace(",", ".")
value = Decimal(t)
transactions.append(
Transaction(date.isoformat(), description, "Bank3", value)
)
return transactions return transactions

View File

@ -4,6 +4,7 @@ import datetime as dt
from .database import DBManager from .database import DBManager
from .graph import discrete, monthly from .graph import discrete, monthly
from .parsers import parse_data
from .transactions import load_transactions, save_transactions from .transactions import load_transactions, save_transactions
from . import report from . import report
from . import tools from . import tools
@ -45,7 +46,13 @@ def argparser():
p_export = subparsers.add_parser("export", help="export help") p_export = subparsers.add_parser("export", help="export help")
p_export.set_defaults(func=lambda args: DBManager(args.db).export()) p_export.set_defaults(func=lambda args: DBManager(args.db).export())
"""
Parsing
"""
p_parse = subparsers.add_parser("parse", help="parse help") p_parse = subparsers.add_parser("parse", help="parse help")
p_parse.add_argument("path", nargs="+", type=str)
p_parse.add_argument("--bank", nargs=1, type=str)
p_parse.set_defaults(func=parse)
# p_restart = subparsers.add_parser("restart", help="restart help") # p_restart = subparsers.add_parser("restart", help="restart help")
p_vacation = subparsers.add_parser( p_vacation = subparsers.add_parser(
@ -91,7 +98,6 @@ def argparser():
p_graph_interval.add_argument("--end", type=str, nargs=1, help="graph end date") p_graph_interval.add_argument("--end", type=str, nargs=1, help="graph end date")
p_graph_interval.add_argument("--year", type=str, nargs=1, help="graph year") p_graph_interval.add_argument("--year", type=str, nargs=1, help="graph year")
p_parse.set_defaults(func=parse)
# p_restart.set_defaults(func=restart) # p_restart.set_defaults(func=restart)
p_vacation.set_defaults(func=vacation) p_vacation.set_defaults(func=vacation)
p_status.set_defaults(func=status) p_status.set_defaults(func=status)
@ -134,7 +140,7 @@ def restart(state, args):
raise PfBudgetNotInitialized(f"{Path(tools.STATE_FILE)} doesn't exist") raise PfBudgetNotInitialized(f"{Path(tools.STATE_FILE)} doesn't exist")
def parse(state, args): def parse(args):
"""Parser """Parser
Parses the contents of the raw directory into the data files, and Parses the contents of the raw directory into the data files, and
@ -144,11 +150,17 @@ def parse(state, args):
state (PFState): Internal state of the program state (PFState): Internal state of the program
args (dict): argparse variables args (dict): argparse variables
""" """
raw_dir = args.raw if hasattr(args, "raw") else None for path in args.path:
data_dir = args.data if hasattr(args, "data") else None if (dir := Path(path)).is_dir():
for file in dir.iterdir():
tools.parser(state, raw_dir, data_dir) parse_data(file, args.bank)
categorize(state, args) elif Path(path).is_file():
trs = parse_data(path, args.bank)
else:
raise FileNotFoundError
# tools.parser(state, raw_dir, data_dir)
# categorize(state, args)
print("\n".join([t.desc() for t in trs]))
def categorize(state, args): def categorize(state, args):

View File

@ -23,10 +23,16 @@ class Transaction:
arg = args[0] if len(args) == 1 else list(args) arg = args[0] if len(args) == 1 else list(args)
try: try:
if type(arg[0]) is date:
self.date = arg[0]
else:
self.date = date.fromisoformat(arg[0]) self.date = date.fromisoformat(arg[0])
self.description = " ".join(arg[1].split()) self.description = " ".join(arg[1].split())
self.bank = arg[2] self.bank = arg[2]
self.value = Decimal(arg[3]) if type(arg[3]) is float:
self.value = arg[3]
else:
self.value = Decimal(args[3])
self.category = arg[4] self.category = arg[4]
except IndexError: except IndexError:
pass pass

54
pfbudget/utils.py Normal file
View File

@ -0,0 +1,54 @@
from decimal import Decimal
from pathlib import Path
class WrongFilenameError(Exception):
    """Raised when no bank name can be extracted from a filename."""
class BankNotAvailableError(Exception):
    """Raised when the bank found in a filename is not a known bank."""
class CreditCardNotAvailableError(Exception):
    """Raised when the card found in a filename is not a known credit card."""
def parse_decimal(s: str) -> Decimal:
    """Parse a monetary amount written in either US or EU notation.

    Strings that ``float`` already understands (e.g. ``"12.5"``) are passed
    straight to ``Decimal``. Anything else is cleaned of spaces,
    non-breaking spaces, the euro sign and a leading ``+``. The right-most
    of ``,`` and ``.`` is then taken as the decimal mark and every other
    occurrence as a thousands separator, so both ``"5,000.00"`` and
    ``"5 000,00"`` parse to the same value.

    Args:
        s: Textual amount as found in a bank extract.

    Returns:
        The parsed amount.

    Raises:
        decimal.InvalidOperation: If the cleaned string is still not a
            valid decimal literal.
    """
    try:
        float(s)
        return Decimal(s)
    except ValueError:
        pass

    # NOTE(review): the third entry was an empty string in the visible
    # source (a no-op replace), which looks like a character lost to
    # mis-encoding; the euro sign is assumed here -- confirm.
    cleaned = s.strip()
    for noise in ("\xa0", " ", "\u20ac", "+"):
        cleaned = cleaned.replace(noise, "")

    if cleaned.rfind(",") > cleaned.rfind("."):
        # EU notation: dots are thousands separators, last comma is the
        # decimal mark.
        whole, _, fraction = cleaned.replace(".", "").rpartition(",")
        cleaned = f"{whole}.{fraction}"

    # Any comma still present is a thousands separator.
    return Decimal(cleaned.replace(",", ""))
def find_credit_institution(fn, banks, creditcards):
    """Extract the bank and optional credit card from a filename.

    The filename stem is split on ``_``. Tokens that parse as integers are
    ignored, the first remaining token is the bank, and any later one is
    the credit card (the last one wins). Empty tokens, as produced by a
    trailing or doubled underscore, are skipped so they cannot overwrite a
    card that was already found.

    Args:
        fn: Path or filename of the extract.
        banks: Collection of known bank names; ``None`` is treated as empty.
        creditcards: Collection of known credit card names; ``None`` is
            treated as empty.

    Returns:
        A ``(bank, creditcard)`` tuple; ``creditcard`` is ``None`` when the
        filename names no card.

    Raises:
        WrongFilenameError: If the filename contains no bank token.
        BankNotAvailableError: If the bank is not in ``banks``.
        CreditCardNotAvailableError: If a card is named but is not in
            ``creditcards``.
    """
    bank, cc = None, None
    for token in Path(fn).stem.split("_"):
        if not token:
            continue
        try:
            int(token)
        except ValueError:
            if not bank:
                bank = token
            else:
                cc = token

    if not bank:
        raise WrongFilenameError(f"no bank name found in filename {fn!r}")
    if bank not in (banks or ()):
        raise BankNotAvailableError(f"bank {bank!r} is not available")
    # A missing credit card list means no card is available, not a TypeError.
    if cc and cc not in (creditcards or ()):
        raise CreditCardNotAvailableError(f"credit card {cc!r} is not available")
    return bank, cc

View File

@ -1 +1,2 @@
matplotlib==3.3.4 matplotlib==3.3.4
PyYAML==5.4.1