Select Git revision
org_users.py
-
Martin Mareš authored
csv.py 3.89 KiB
# CSV import and export
# What a CSV file produced by recent versions of Excel looks like:
# - it is in UTF-8 and starts with a BOM
# - lines are terminated by CR+LF
# - the separator is a comma
# - quoting is done with double quotes
import csv
import difflib
from enum import auto
from dataclasses import dataclass, fields
from typing import Type, List, IO, Sequence
import mo.db as db
class FileFormat(db.MOEnum):
    """Table file formats supported for import and export.

    Every member has an entry in the module-level ``format_desc`` table;
    the accessor methods below simply look up one attribute of that entry.
    """

    en_csv = auto()
    cs_csv = auto()
    tsv = auto()

    def friendly_name(self) -> str:
        """Return the human-readable description of this format."""
        desc = format_desc[self]
        return desc.friendly_name

    def get_dialect(self) -> str:
        """Return the name of the ``csv`` dialect used for this format."""
        desc = format_desc[self]
        return desc.csv_dialect

    def get_charset(self) -> str:
        """Return the character set name recorded for this format."""
        desc = format_desc[self]
        return desc.charset

    def get_content_type(self) -> str:
        """Return the content type string recorded for this format."""
        desc = format_desc[self]
        return desc.content_type

    def get_extension(self) -> str:
        """Return the file name extension (without a dot) for this format."""
        desc = format_desc[self]
        return desc.extension
@dataclass
class FileFormatDesc:
    """Static description of one file format (one entry of ``format_desc``)."""
    # Human-readable description of the format.
    friendly_name: str
    # Name of the csv-module dialect; passed as `dialect=` to csv.reader/csv.writer.
    csv_dialect: str
    # Character set name, e.g. 'utf-8'. NOTE(review): the code that encodes or
    # decodes with it is not in this module -- confirm against the callers.
    charset: str
    # Content type string including the charset parameter; presumably sent as
    # an HTTP Content-Type header -- verify against the callers.
    content_type: str
    # File name extension without the leading dot.
    extension: str
# Properties of every FileFormat member; the FileFormat accessor methods read
# this table. The 'excel' dialect is built into the csv module, while
# 'cs-excel' and 'tsv' are registered further down in this module.
format_desc = {
    FileFormat.en_csv: FileFormatDesc(
        friendly_name='sloupce oddělené čárkami v UTF-8 (anglický Excel)',
        csv_dialect='excel',
        charset='utf-8',
        content_type='text/csv; charset=utf-8',
        extension='csv',
    ),
    FileFormat.cs_csv: FileFormatDesc(
        friendly_name='sloupce oddělené středníky ve windows-1250 (český Excel)',
        csv_dialect='cs-excel',
        charset='windows-1250',
        content_type='text/plain; charset=windows-1250',
        extension='csv',
    ),
    FileFormat.tsv: FileFormatDesc(
        friendly_name='sloupce oddělené tabulátory v UTF-8',
        csv_dialect='tsv',
        charset='utf-8',
        content_type='text/tab-separated-values; charset=utf-8',
        extension='tsv',
    ),
}
class TSV(csv.Dialect):
    """csv dialect for tab-separated values with LF line endings.

    Fields are quoted with double quotes only when necessary, and a quote
    character inside a field is written doubled.
    """

    # Record and field separators.
    delimiter = '\t'
    lineterminator = '\n'

    # Quoting rules.
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
    doublequote = True
class CzechExcel(csv.excel):
    """The csv module's ``excel`` dialect with a semicolon as the delimiter."""

    delimiter = ';'
# Register the custom dialects under the names used by the csv_dialect
# entries of format_desc, so they can be selected by name in csv.reader and
# csv.writer. This runs once, as a side effect of importing this module.
csv.register_dialect('tsv', TSV)
csv.register_dialect('cs-excel', CzechExcel)
class Row:
    """A table row is a dataclass derived from this class. Its attributes
    correspond to the columns; they must be of type str and have a default value."""
class MissingHeaderError(RuntimeError):
    """Raised when the header row contains none of the expected column names."""
def write(file: IO, fmt: FileFormat, row_class: Type[Row], rows: Sequence[Row]):
    """Write ``rows`` to ``file`` as a table in the dialect given by ``fmt``.

    The first record written is a header holding the field names of the
    ``row_class`` dataclass, in declaration order. Each following record
    holds the values of those fields for one element of ``rows``.
    """
    out = csv.writer(file, dialect=fmt.get_dialect())
    column_names = [f.name for f in fields(row_class)]

    out.writerow(column_names)
    out.writerows(
        [getattr(record, name) for name in column_names]
        for record in rows
    )
def read(file: IO, fmt: FileFormat, row_class: Type[Row]):
    """Parse a table from ``file`` into instances of ``row_class``.

    The first record is the header; its cells name the columns. Each cell of
    a later record is stripped of surrounding whitespace and stored in the
    attribute named by the header cell at the same position. Columns whose
    name is not a field of ``row_class`` are ignored, and a warning is
    produced when such a name closely resembles a known field. Records whose
    cells are all empty or '-' are skipped.

    Args:
        file: Text stream (or any iterable of lines) to read from.
        fmt: File format; its ``get_dialect()`` selects the csv dialect.
        row_class: Dataclass whose field names are the accepted columns. It
            is instantiated without arguments for every record.

    Returns:
        A tuple ``(rows, warnings)``: the parsed rows and a list of
        user-facing warning messages. Completely empty input yields
        ``([], [])``.

    Raises:
        MissingHeaderError: If no cell of the first record is a known column
            name (this includes a blank first line).
        csv.Error: If the input is malformed; the reader runs in strict mode.
    """
    reader = csv.reader(file, dialect=fmt.get_dialect(), strict=True)
    warnings: List[str] = []
    rows: List[Row] = []
    columns = {field.name for field in fields(row_class)}

    # Take the first *record* as the header. reader.line_num counts physical
    # lines, so it is already greater than 1 when a quoted header cell
    # contains a line break; relying on it would lose the header.
    header = next(reader, None)
    if header is None:
        return (rows, warnings)

    # A blank first line is parsed as an empty record, hence the emptiness
    # check before looking at the first cell.
    if header and header[0].startswith('\ufeff'):
        # Excel's BOM
        header[0] = header[0][1:]

    if not any(h in columns for h in header):
        raise MissingHeaderError()

    for h in header:
        if h not in columns:
            best_matches = difflib.get_close_matches(h, columns, n=1, cutoff=0.8)
            if best_matches:
                warnings.append(
                    "Neznámý sloupec '{}', měli jste na mysli '{}'?".format(
                        h, best_matches[0]))

    for r in reader:
        row = row_class()
        not_empty = False
        # zip() stops at the shorter of the two, so surplus cells and
        # missing trailing cells are both tolerated.
        for f, cell in zip(header, r):
            x = cell.strip()
            if x != "" and x != '-':
                not_empty = True
            if f in columns:
                setattr(row, f, x)
        if not_empty:
            rows.append(row)

    return (rows, warnings)