Parser configuration now lives in parsers.yaml

parsers.py was rewritten around a single default parser that takes its
configuration parameters from a YAML file.
Additional parsing behaviour can be enabled with the additional_parser
attribute in the YAML file and by overriding the func and parse methods in
child classes of Parser.
func is called after each transaction is created, and an overridden parse
method should either call the parent's parse method or reimplement the
entire parsing process.
The parse_data function is now called from the runnable, which now drives
the parsing process. The parse command takes an optional bank (otherwise it
tries to extract the bank from the filename) and multiple paths, each
either a file or a directory.
The Transaction __init__ was fixed to accept inputs taken from a previously
initialized Transaction.
Also adds utils.py with helper functions.
This commit is contained in:
Luís Murta 2021-06-03 23:30:32 +01:00
parent 4d9e8edec8
commit 4d6c865bb5
Signed by: satprog
GPG Key ID: DDF2EFC6179009DC
7 changed files with 273 additions and 187 deletions

View File

@ -3,7 +3,7 @@
parsing -> categorize -> analyze (predict)
## Parsing
Parses bank extracts, based on parsers.py functions, to SQLite db.
Parses bank extracts, based on parsers.yaml, to a SQLite database.
## Categorize
Categorizes transactions based on categories.py filters.

60
parsers.yaml Normal file
View File

@ -0,0 +1,60 @@
Banks:
- Bank1
- Bank2
- Bank3
CreditCards:
- VISA
- MasterCard
- AmericanExpress
default: &default
encoding: utf-8
separator: "\t"
date_fmt: "%Y-%m-%d"
Bank1:
<<: *default
separator: ";"
date_fmt: "%d/%m/%Y"
start: 6
debit:
date: 1
text: 3
value: 4
additional_parser: true
Bank2: &bank2
<<: *default
date_fmt: "%d/%m/%Y"
debit:
date: 0
text: 2
value: 3
VISA:
<<: *bank2
debit:
date: 0
text: 2
value: 3
credit:
date: 1
text: 2
value: 3
Bank3:
encoding: windows-1252
separator: ","
date_fmt: "%d-%m-%Y"
start: 8
end: -1
debit:
date: 1
text: 2
value: 3
negate: true
credit:
date: 0
text: 2
value: 4

View File

@ -1,198 +1,151 @@
from datetime import datetime
from collections import namedtuple
from decimal import Decimal
from pathlib import Path
from importlib import import_module
from typing import Final
import datetime as dt
import yaml
from .transactions import Transaction
from . import utils
def parse_data(file: Path, append=False):
name = file.stem.split("_")
try:
bank, _ = name[0], int(name[1])
except ValueError:
_, bank = int(name[0]), name[1]
# Parser configuration, loaded once when the module is imported. The path is
# relative, so it is resolved against the current working directory.
with open("parsers.yaml") as _cfg_file:  # closes the handle instead of leaking it
    cfg: Final = yaml.safe_load(_cfg_file)
# Every lookup below starts from the "Banks" list, so fail early without it.
assert (
    "Banks" in cfg
), "parsers.yaml is missing the Banks section with the list of available banks"
p = dict(
Bank1=Bank1,
Bank2=Bank2,
Bank2CC=Bank2CC,
BANK3=Bank3,
)
# Column positions of one kind of movement (debit or credit) inside a split
# line. -1 marks a position as "not configured"; Parser.index relies on that
# sentinel for the date field. `negate` asks for the parsed value to be
# sign-flipped.
Index = namedtuple(
    "Index",
    (
        "date",
        "text",
        "value",
        "negate",
    ),
    defaults=(-1, -1, -1, False),
)

# Per-bank parsing options, built from one section of parsers.yaml.
Options = namedtuple(
    "Options",
    (
        "encoding",  # passed to open()
        "separator",  # field separator used to split each line
        "date_fmt",  # strptime format of the date column
        "start",  # 1-based number of the first line to parse
        "end",  # slice end of the lines to parse (None = until the last line)
        "debit",  # Index of the debit columns
        "credit",  # Index of the credit columns
        "additional_parser",  # True when a Parser subclass provides extra logic
        # NOTE(review): presumably these three exist so that a bank section
        # holding nested credit card sections can still be expanded into
        # Options(**section) -- confirm against parsers.yaml.
        "VISA",
        "MasterCard",
        "AmericanExpress",
    ),
    defaults=("", "", "", 1, None, Index(), Index(), False, None, None, None),
)
try:
parser = p[bank]()
except KeyError as e:
print(f"{e} {bank} parser doesnt exist. Cant parse {name}")
return
transactions = parser.parse(file)
def parse_data(filename: str, bank=None) -> list:
    """Parse one bank extract into a list of transactions.

    Args:
        filename: Path of the extract to parse.
        bank: Name of the bank section in parsers.yaml. When falsy, the bank
            and the optional credit card are derived from the file name by
            utils.find_credit_institution. A one-element list or tuple, as
            produced by an argparse option declared with ``nargs=1``, is
            unwrapped to its single item.

    Returns:
        The transactions produced by the selected parser.
    """
    # The runnable forwards args.bank, which argparse builds as ["<bank>"]
    # because of nargs=1; a list cannot be used as a key of cfg.
    if isinstance(bank, (list, tuple)):
        bank = bank[0] if bank else None

    # Bind creditcard up front: it used to be assigned only when the bank was
    # inferred, so passing a bank explicitly raised UnboundLocalError below.
    creditcard = None
    if not bank:
        bank, creditcard = utils.find_credit_institution(
            filename, cfg.get("Banks"), cfg.get("CreditCards")
        )

    if creditcard:
        # Credit card options are nested inside the section of their bank.
        options = cfg[bank][creditcard]
        bank += creditcard
    else:
        options = cfg[bank]

    parser_cls = Parser
    if options.get("additional_parser", False):
        # The specialised parser is the class named after the bank
        # (bank + credit card for cards) inside pfbudget.parsers.
        parser_cls = getattr(import_module("pfbudget.parsers"), bank)
    return parser_cls(filename, bank, options).parse()
def transaction(line: str, bank: str, options: Options, func) -> Transaction:
    """Build a Transaction from one raw line of a bank extract.

    Args:
        line: Raw text line; it is right-stripped and split on
            ``options.separator``.
        bank: Bank name stored in the transaction.
        options: Parsing options; provides the separator, the date format and
            the debit/credit column indexes.
        func: Callable invoked with the new transaction when
            ``options.additional_parser`` is truthy.

    Returns:
        The created Transaction.
    """
    fields = line.rstrip().split(options.separator)
    # Decide whether this line is laid out as a debit or as a credit.
    columns = Parser.index(fields, options)

    parsed_day = dt.datetime.strptime(
        fields[columns.date].strip(), options.date_fmt
    )
    iso_date = parsed_day.date().isoformat()
    description = fields[columns.text]
    amount = utils.parse_decimal(fields[columns.value])
    amount = -amount if columns.negate else amount

    created = Transaction(iso_date, description, bank, amount)
    if options.additional_parser:
        func(created)
    return created
class Parser:
    """Default, configuration-driven parser for one bank extract.

    Subclasses can override ``func`` to post-process each transaction and
    ``parse`` to extend or replace the parsing process.
    """

    def __init__(self, filename: str, bank: str, options: dict):
        """Store the target file and turn the raw options into an Options tuple.

        Args:
            filename: Path of the extract to parse.
            bank: Bank name stored in every transaction.
            options: Mapping with the keys accepted by Options; its "debit"
                and "credit" entries, when present, are mappings accepted by
                Index. The mapping is not modified.
        """
        self.filename = filename
        self.bank = bank

        # Work on a shallow copy. The mapping normally comes straight from the
        # shared configuration, and replacing its "debit"/"credit" entries in
        # place made a second parser for the same bank fail on Index(**Index).
        options = dict(options)
        for side in ("debit", "credit"):
            if columns := options.get(side, None):
                options[side] = Index(**columns)
        self.options = Options(**options)

    def func(self, transaction: Transaction):
        """Hook called with each new transaction; does nothing by default."""

    def parse(self) -> list:
        """Parse the configured lines of the file.

        Returns:
            One Transaction per line between ``options.start`` (1-based) and
            ``options.end`` (slice end).
        """
        # Read everything first: options.end may be negative, which needs the
        # full list of lines, and the handle is closed before parsing starts.
        with open(self.filename, encoding=self.options.encoding) as extract:
            lines = extract.readlines()

        selected = lines[self.options.start - 1 : self.options.end]
        return [
            transaction(line, self.bank, self.options, self.func)
            for line in selected
        ]

    @staticmethod
    def index(line: list, options: Options) -> Index:
        """Pick the debit or the credit column layout for a split line.

        When only one layout is configured (its date is not -1) it is returned
        directly. When both are, the first of value, date and text whose
        column differs between the layouts decides: the layout whose column
        holds a non-empty field wins, debit being checked first.

        Args:
            line: Fields of one line, already split.
            options: Object exposing the ``debit`` and ``credit`` layouts.

        Returns:
            The layout to use for this line.

        Raises:
            IndexError: If no layout is configured, if both layouts are
                identical, or if both discriminating columns are empty.
        """
        debit, credit = options.debit, options.credit
        has_debit = debit.date != -1
        has_credit = credit.date != -1

        if not (has_debit or has_credit):
            raise IndexError("No debit nor credit indexes available")
        if not has_credit:
            return debit
        if not has_debit:
            return credit

        for field in ("value", "date", "text"):
            debit_col = getattr(debit, field)
            credit_col = getattr(credit, field)
            if debit_col == credit_col:
                continue  # this column cannot tell the layouts apart
            if line[debit_col]:
                return debit
            if line[credit_col]:
                return credit
            # Previously this fell through to an unbound local variable.
            raise IndexError(
                f"Neither the debit nor the credit {field} column holds data"
            )

        raise IndexError("Debit and credit indexes are equal")
class Bank1(Parser):
"""Bank 1 parser
def __init__(self, filename: str, bank: str, options: dict):
super().__init__(filename, bank, options)
self.transfers = []
self.transaction_cost = -Decimal("1")
Bank 1 transcripts have the following properties:
encoding: utf-8
separator: ;
starting line: 5
date format: %d/%m/%Y
The reading order is reversed to go from earlier to latest.
"""
encoding = "utf-8"
separator = ";"
def parse(self, file):
transactions = []
reader = [
line.rstrip().split(self.separator)
for line in open(file, encoding=self.encoding)
][5:]
for transaction in reversed(reader):
transaction = [field.rstrip() for field in transaction]
date = datetime.strptime(transaction[1], "%d/%m/%Y").date()
description = " ".join(transaction[3].split())
value = Decimal(transaction[4])
def func(self, transaction: Transaction):
if "transf" in transaction.description.lower() and transaction.value < 0:
transaction.value -= self.transaction_cost
self.transfers.append(transaction.date)
def parse(self) -> list:
transactions = super().parse()
for date in self.transfers:
transactions.append(
Transaction(date.isoformat(), description, "Bank1", value)
Transaction(date, "Transaction cost", self.bank, self.transaction_cost)
)
return transactions
class Bank2(Parser):
"""Bank 2 parser
Bank 2 transcripts have the following properties:
encoding: utf-8
separator: tab
date format: %d/%m/%Y or %d-%m-%Y
decimal separator: ,
"""
encoding = "utf-8"
separator = "\t"
def parse(self, file):
transactions = []
reader = [
line.rstrip().split(self.separator)
for line in open(file, encoding=self.encoding)
]
for transaction in reader:
try:
date = datetime.strptime(transaction[0], "%d/%m/%Y").date()
except ValueError: # date can differ due to locales
date = datetime.strptime(transaction[0], "%d-%m-%Y").date()
description = transaction[2]
# works for US and EU locales (5,000.00 and 5 000,00)
value = list(transaction[3].replace("\xa0", "")) # non-breaking space
value[-3] = "."
value = "".join(value)
value = value.replace(",", "")
value = Decimal(value)
transactions.append(
Transaction(date.isoformat(), description, "Bank2", value)
)
return transactions
class Bank2CC(Parser):
"""Bank 2 credit card parser
Bank 2 credit card transcripts have the following properties:
encoding: utf-8
separator: tab
date format: %d/%m/%Y or %d-%m-%Y
decimal separator: ,
"""
encoding = "utf-8"
separator = "\t"
def parse(self, file):
transactions = []
reader = [
line.rstrip().split(self.separator)
for line in open(file, encoding=self.encoding)
]
for transaction in reader:
try:
date = datetime.strptime(transaction[0], "%d/%m/%Y").date()
except ValueError: # date can differ due to locales
date = datetime.strptime(transaction[0], "%d-%m-%Y").date()
description = transaction[2]
# works for US and EU locales (5,000.00 and 5 000,00)
value = list(transaction[3].replace("\xa0", "")) # non-breaking space
value[-3] = "."
value = "".join(value)
value = value.replace(",", "")
value = Decimal(value)
if value > 0:
date = datetime.strptime(transaction[1], "%d/%m/%Y").date()
transactions.append(
Transaction(date.isoformat(), description, "Bank2CC", value)
)
return transactions
class Bank3(Parser):
"""Bank 3 parser
Bank 3 transcripts have the following properties:
encoding: windows-1252 (passed as argument)
separator: ;
starting line: 7
finishing line: -1
date format: %d-%m-%Y
decimal separator: ,
thousands separator: .
Bank 3 has credits in a different column from debits. These also have to be
negated. The reading order is reversed to go from earlier to latest.
"""
encoding = "windows-1252"
separator = ","
def parse(self, file):
transactions = []
reader = [
line.rstrip().split(self.separator)
for line in open(file, encoding=self.encoding)
][7:-1]
for transaction in reversed(reader):
transaction = [field.rstrip() for field in transaction]
date = datetime.strptime(transaction[1], "%d-%m-%Y").date()
description = transaction[2]
if t := transaction[3]:
t = t.replace(".", "").replace(",", ".")
value = -Decimal(t)
else:
t = transaction[4].replace(".", "").replace(",", ".")
value = Decimal(t)
transactions.append(
Transaction(date.isoformat(), description, "Bank3", value)
)
return transactions

View File

@ -4,6 +4,7 @@ import datetime as dt
from .database import DBManager
from .graph import discrete, monthly
from .parsers import parse_data
from .transactions import load_transactions, save_transactions
from . import report
from . import tools
@ -45,7 +46,13 @@ def argparser():
p_export = subparsers.add_parser("export", help="export help")
p_export.set_defaults(func=lambda args: DBManager(args.db).export())
"""
Parsing
"""
p_parse = subparsers.add_parser("parse", help="parse help")
p_parse.add_argument("path", nargs="+", type=str)
p_parse.add_argument("--bank", nargs=1, type=str)
p_parse.set_defaults(func=parse)
# p_restart = subparsers.add_parser("restart", help="restart help")
p_vacation = subparsers.add_parser(
@ -91,7 +98,6 @@ def argparser():
p_graph_interval.add_argument("--end", type=str, nargs=1, help="graph end date")
p_graph_interval.add_argument("--year", type=str, nargs=1, help="graph year")
p_parse.set_defaults(func=parse)
# p_restart.set_defaults(func=restart)
p_vacation.set_defaults(func=vacation)
p_status.set_defaults(func=status)
@ -134,7 +140,7 @@ def restart(state, args):
raise PfBudgetNotInitialized(f"{Path(tools.STATE_FILE)} doesn't exist")
def parse(state, args):
def parse(args):
"""Parser
Parses the contents of the raw directory into the data files, and
@ -144,11 +150,17 @@ def parse(state, args):
state (PFState): Internal state of the program
args (dict): argparse variables
"""
raw_dir = args.raw if hasattr(args, "raw") else None
data_dir = args.data if hasattr(args, "data") else None
tools.parser(state, raw_dir, data_dir)
categorize(state, args)
for path in args.path:
if (dir := Path(path)).is_dir():
for file in dir.iterdir():
parse_data(file, args.bank)
elif Path(path).is_file():
trs = parse_data(path, args.bank)
else:
raise FileNotFoundError
# tools.parser(state, raw_dir, data_dir)
# categorize(state, args)
print("\n".join([t.desc() for t in trs]))
def categorize(state, args):

View File

@ -23,10 +23,16 @@ class Transaction:
arg = args[0] if len(args) == 1 else list(args)
try:
self.date = date.fromisoformat(arg[0])
if type(arg[0]) is date:
self.date = arg[0]
else:
self.date = date.fromisoformat(arg[0])
self.description = " ".join(arg[1].split())
self.bank = arg[2]
self.value = Decimal(arg[3])
if type(arg[3]) is float:
self.value = arg[3]
else:
self.value = Decimal(args[3])
self.category = arg[4]
except IndexError:
pass

54
pfbudget/utils.py Normal file
View File

@ -0,0 +1,54 @@
from decimal import Decimal
from pathlib import Path
class WrongFilenameError(Exception):
    """Raised when no bank name can be found in a file name."""
class BankNotAvailableError(Exception):
    """Raised when the bank found in a file name is not a known bank."""
class CreditCardNotAvailableError(Exception):
    """Raised when the credit card found in a file name is not a known card."""
def parse_decimal(s: str) -> Decimal:
    """Convert an amount written in either "1,234.56" or "1.234,56" style.

    Text that float() already accepts is handed to Decimal unchanged.
    Otherwise spaces, non-breaking spaces and "+" signs are dropped, and the
    right-most of "," and "." is taken as the decimal separator while the
    other one is discarded as a thousands separator.

    Args:
        s: Textual amount.

    Returns:
        The amount as a Decimal.
    """
    try:
        float(s)
    except ValueError:
        pass
    else:
        return Decimal(s)

    cleaned = s.strip().replace(u"\xa0", "").replace(" ", "")
    # NOTE(review): the first replace has an empty pattern as seen here, which
    # makes it a no-op; possibly a symbol was lost from the literal -- confirm.
    cleaned = cleaned.strip().replace("", "").replace("+", "")

    if cleaned.rfind(",") > cleaned.rfind("."):
        # The comma is the decimal separator: drop the dots, then turn the
        # last comma into the decimal point.
        without_dots = cleaned.replace(".", "")
        whole, _, fraction = without_dots.rpartition(",")
        cleaned = whole + "." + fraction

    # Any comma still present is a thousands separator.
    return Decimal(cleaned.replace(",", ""))
def find_credit_institution(fn, banks, creditcards):
    """Extract the bank and the optional credit card from a file name.

    The stem of the file name is split on "_". Parts that int() accepts are
    ignored; the first remaining part is the bank and the last further
    remaining part, if any, is the credit card.

    Args:
        fn: File name or path of the extract.
        banks: Container of known bank names; None is treated as empty.
        creditcards: Container of known credit card names; None is treated as
            empty.

    Returns:
        A ``(bank, creditcard)`` tuple; ``creditcard`` is None when the file
        name holds none.

    Raises:
        WrongFilenameError: If the file name holds no bank name.
        BankNotAvailableError: If the bank is not in ``banks``.
        CreditCardNotAvailableError: If the credit card is not in
            ``creditcards``.
    """
    bank, cc = None, None
    for part in Path(fn).stem.split("_"):
        try:
            int(part)
        except ValueError:
            if not bank:
                bank = part
            else:
                cc = part

    if not bank:
        raise WrongFilenameError(f"no bank name found in file name {str(fn)!r}")
    # A missing configuration section arrives here as None; treat it as an
    # empty container so the dedicated error is raised instead of a TypeError.
    if bank not in (banks or ()):
        raise BankNotAvailableError(bank)
    if cc and cc not in (creditcards or ()):
        raise CreditCardNotAvailableError(cc)
    return bank, cc

View File

@ -1 +1,2 @@
matplotlib==3.3.4
PyYAML==5.4.1