diff --git a/README.md b/README.md index 9f14d85..7e75a4b 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ parsing -> categorize -> analyze (predict) ## Parsing -Parses bank extracts, based on parsers.py functions, to SQLite db. +Parses bank extracts, based on parsers.yaml, to a SQLite database. ## Categorize Categorizes transactions based on categories.py filters. diff --git a/parsers.yaml b/parsers.yaml new file mode 100644 index 0000000..512c960 --- /dev/null +++ b/parsers.yaml @@ -0,0 +1,60 @@ +Banks: + - Bank1 + - Bank2 + - Bank3 + +CreditCards: + - VISA + - MasterCard + - AmericanExpress + +default: &default + encoding: utf-8 + separator: "\t" + date_fmt: "%Y-%m-%d" + +Bank1: + <<: *default + separator: ";" + date_fmt: "%d/%m/%Y" + start: 6 + debit: + date: 1 + text: 3 + value: 4 + additional_parser: true + + +Bank2: &bank2 + <<: *default + date_fmt: "%d/%m/%Y" + debit: + date: 0 + text: 2 + value: 3 + VISA: + <<: *bank2 + debit: + date: 0 + text: 2 + value: 3 + credit: + date: 1 + text: 2 + value: 3 + +Bank3: + encoding: windows-1252 + separator: "," + date_fmt: "%d-%m-%Y" + start: 8 + end: -1 + debit: + date: 1 + text: 2 + value: 3 + negate: true + credit: + date: 0 + text: 2 + value: 4 diff --git a/pfbudget/parsers.py b/pfbudget/parsers.py index a7b08d7..2c4472a 100644 --- a/pfbudget/parsers.py +++ b/pfbudget/parsers.py @@ -1,198 +1,151 @@ -from datetime import datetime +from collections import namedtuple from decimal import Decimal -from pathlib import Path +from importlib import import_module +from typing import Final +import datetime as dt +import yaml from .transactions import Transaction +from . 
def parse_data(filename: str, bank=None) -> list:
    """Parse one bank extract into a list of transactions.

    When ``bank`` is falsy, the bank (and optional credit card) is derived
    from the file name through ``utils.find_credit_institution``, using the
    ``Banks`` and ``CreditCards`` lists of the loaded configuration.

    Args:
        filename: Path of the extract to parse.
        bank: Name of the bank section to use; ``None`` to infer it from
            ``filename``.

    Returns:
        The transactions produced by the selected parser.

    Raises:
        KeyError: If the bank (or credit card) has no section in the
            configuration.
    """
    # A credit card can only come from the file name. Initialising it here
    # keeps the explicit-bank path from hitting an unbound local below.
    creditcard = None
    if not bank:
        bank, creditcard = utils.find_credit_institution(
            filename, cfg.get("Banks"), cfg.get("CreditCards")
        )

    if creditcard:
        options = cfg[bank][creditcard]
        # Credit card parsers and transactions are labelled "<bank><card>".
        bank += creditcard
    else:
        options = cfg[bank]

    if options.get("additional_parser", False):
        # Banks flagged with ``additional_parser`` provide a Parser subclass
        # named after the bank in this module.
        parser_cls = getattr(import_module("pfbudget.parsers"), bank)
    else:
        parser_cls = Parser

    return parser_cls(filename, bank, options).parse()
class Parser:
    """Generic, configuration-driven parser for a bank extract."""

    def __init__(self, filename: str, bank: str, options: dict):
        """Store the target file and build the typed options.

        Args:
            filename: Path of the extract to read.
            bank: Label stored on every transaction produced.
            options: Raw option mapping for this bank, as loaded from the
                configuration.
        """
        self.filename = filename
        self.bank = bank

        # Work on a copy: the mapping belongs to the shared configuration and
        # writing Index tuples back into it would break the next parser built
        # from the same section (Index(**debit) needs a mapping).
        options = dict(options)
        if debit := options.get("debit", None):
            options["debit"] = Index(**debit)
        if credit := options.get("credit", None):
            options["credit"] = Index(**credit)

        self.options = Options(**options)

    def func(self, transaction: "Transaction"):
        """Hook called per transaction by bank-specific subclasses."""
        pass

    def parse(self) -> list:
        """Read the extract and convert the selected lines to transactions.

        Lines are selected with ``[start - 1 : end]``, so ``start`` is
        1-based and ``end`` follows slice semantics (``None`` or negative
        values are allowed).
        """
        with open(self.filename, encoding=self.options.encoding) as extract:
            lines = extract.readlines()

        selected = lines[self.options.start - 1 : self.options.end]
        return [
            transaction(line, self.bank, self.options, self.func)
            for line in selected
        ]

    @staticmethod
    def index(line: list, options: "Options") -> "Index":
        """Choose the debit or credit column layout for one split line.

        When both layouts are configured, the first field (value, then date,
        then text) whose column differs between them decides: the layout
        whose column is non-empty in ``line`` wins, debit first.

        Raises:
            IndexError: If no layout is configured, both layouts are
                identical, or neither deciding column holds data.
        """
        debit, credit = options.debit, options.credit

        if debit.date == -1 and credit.date == -1:
            raise IndexError("No debit nor credit indexes available")
        if credit.date == -1:
            return debit
        if debit.date == -1:
            return credit

        for field in ("value", "date", "text"):
            debit_col, credit_col = getattr(debit, field), getattr(credit, field)
            if debit_col == credit_col:
                continue
            if line[debit_col]:
                return debit
            if line[credit_col]:
                return credit
            # Previously this fell through to an unbound local.
            raise IndexError(
                f"Neither debit nor credit {field} column has data in line"
            )

        raise IndexError("Debit and credit indexes are equal")


class Bank1(Parser):
    """Bank1 parser that splits the transfer fee out of transfer debits."""

    def __init__(self, filename: str, bank: str, options: dict):
        super().__init__(filename, bank, options)
        # Dates of the transfers seen during the current parse() run.
        self.transfers = []
        self.transaction_cost = -Decimal("1")

    def func(self, transaction: "Transaction"):
        """Remove the fee from outgoing transfers and remember their date."""
        if "transf" in transaction.description.lower() and transaction.value < 0:
            transaction.value -= self.transaction_cost
            self.transfers.append(transaction.date)

    def parse(self) -> list:
        """Parse the extract and append one fee transaction per transfer."""
        # Start clean so calling parse() again does not duplicate fee rows.
        self.transfers = []
        transactions = super().parse()
        transactions.extend(
            Transaction(date, "Transaction cost", self.bank, self.transaction_cost)
            for date in self.transfers
        )
        return transactions
def parse(args):
    """Parse the given files and directories and print the transactions.

    Every path in ``args.path`` is parsed with ``parse_data``; directories
    are expanded to the regular files they directly contain. All resulting
    transactions are collected and printed, one description per line.

    Args:
        args: argparse namespace with ``path`` (list of paths) and ``bank``
            (``None``, or the one-element list produced by ``nargs=1``).

    Raises:
        FileNotFoundError: If a path is neither a file nor a directory.
    """
    # ``--bank`` is declared with nargs=1, so argparse hands over a list;
    # parse_data expects the bare name (or None to infer it per file).
    bank = args.bank
    if isinstance(bank, (list, tuple)):
        bank = bank[0] if bank else None

    transactions = []
    for path in args.path:
        target = Path(path)
        if target.is_dir():
            # Sorted for a reproducible order; sub-directories are skipped.
            for file in sorted(target.iterdir()):
                if file.is_file():
                    transactions.extend(parse_data(file, bank))
        elif target.is_file():
            transactions.extend(parse_data(path, bank))
        else:
            raise FileNotFoundError(f"{path} is neither a file nor a directory")

    # TODO: persist and categorize the parsed transactions.
    print("\n".join(t.desc() for t in transactions))
from decimal import Decimal
from pathlib import Path


class WrongFilenameError(Exception):
    """The file name contains no bank name."""


class BankNotAvailableError(Exception):
    """The bank found in the file name is not a configured bank."""


class CreditCardNotAvailableError(Exception):
    """The credit card found in the file name is not a configured card."""


def parse_decimal(s: str) -> Decimal:
    """Convert a localized amount string to a ``Decimal``.

    Plain numbers are converted directly. Otherwise spaces, non-breaking
    spaces, ``€`` and ``+`` are removed and the right-most of ``,`` / ``.``
    is taken as the decimal separator, the other one as grouping. A
    separator that occurs more than once can only be grouping, so
    ``1,000,000`` and ``1.000.000`` are both one million.

    Raises:
        decimal.InvalidOperation: If the cleaned string is not a number.
    """
    try:
        # float() is used as a cheap "already a plain number" probe.
        float(s)
        return Decimal(s)
    except ValueError:
        pass

    s = s.strip().replace("\xa0", "").replace(" ", "")
    s = s.strip().replace("€", "").replace("+", "")

    if s.rfind(",") > s.rfind("."):
        decimal_sep, group_sep = ",", "."
    else:
        decimal_sep, group_sep = ".", ","

    s = s.replace(group_sep, "")
    if s.count(decimal_sep) > 1:
        # Repeated separator: it is grouping and there is no fraction.
        s = s.replace(decimal_sep, "")
    else:
        s = s.replace(decimal_sep, ".")
    return Decimal(s)


def find_credit_institution(fn, banks, creditcards):
    """Extract the bank and optional credit card from an extract file name.

    The file stem is split on ``_``; tokens that ``int()`` accepts are
    ignored. The first remaining non-empty token is the bank, and the last
    later token, if any, is the credit card.

    Args:
        fn: File name or path of the extract.
        banks: Collection of known bank names; ``None`` counts as empty.
        creditcards: Collection of known credit card names; ``None`` counts
            as empty.

    Returns:
        A ``(bank, creditcard)`` tuple; ``creditcard`` is ``None`` when the
        file name carries none.

    Raises:
        WrongFilenameError: If no bank token is found.
        BankNotAvailableError: If the bank is not in ``banks``.
        CreditCardNotAvailableError: If the card is not in ``creditcards``.
    """
    bank, cc = None, None
    for token in Path(fn).stem.split("_"):
        try:
            int(token)
        except ValueError:
            if not bank:
                bank = token
            else:
                cc = token

    if not bank:
        raise WrongFilenameError(f"no bank name found in file name {fn}")

    # A missing configuration section arrives here as None.
    if bank not in (banks or ()):
        raise BankNotAvailableError(f"bank {bank} is not available")
    if cc and cc not in (creditcards or ()):
        raise CreditCardNotAvailableError(f"credit card {cc} is not available")

    return bank, cc