Skip to content
Snippets Groups Projects
Select Git revision
  • 19cc463e5943881e16e31d79d5b4a83012b3ab51
  • devel default
  • master
  • fo
  • jirka/typing
  • fo-base
  • mj/submit-images
  • jk/issue-96
  • jk/issue-196
  • honza/add-contestant
  • honza/mr7
  • honza/mrf
  • honza/mrd
  • honza/mra
  • honza/mr6
  • honza/submit-images
  • honza/kolo-vs-soutez
  • jh-stress-test-wip
  • shorten-schools
19 results

csv.py

Blame
  • csv.py 3.89 KiB
    # CSV import and export
    
    # What a CSV file exported from newer versions of Excel looks like:
    #   - it is encoded in UTF-8 and starts with a BOM
    #   - lines are terminated by CR+LF
    #   - the separator is a comma
    #   - quoting uses double quotes
    
    import csv
    import difflib
    from enum import auto
    from dataclasses import dataclass, fields
    from typing import Type, List, IO, Sequence
    
    import mo.db as db
    
    
    class FileFormat(db.MOEnum):
        """Table file formats known to this module.

        Every accessor below looks up the entry for this member in the
        module-level `format_desc` table and returns one of its fields.
        """

        en_csv = auto()
        cs_csv = auto()
        tsv = auto()

        def friendly_name(self) -> str:
            """Return the human-readable description of this format."""
            desc = format_desc[self]
            return desc.friendly_name

        def get_dialect(self) -> str:
            """Return the name of the csv dialect used for this format."""
            desc = format_desc[self]
            return desc.csv_dialect

        def get_charset(self) -> str:
            """Return the character set name of this format."""
            desc = format_desc[self]
            return desc.charset

        def get_content_type(self) -> str:
            """Return the content type string of this format."""
            desc = format_desc[self]
            return desc.content_type

        def get_extension(self) -> str:
            """Return the file name extension of this format."""
            desc = format_desc[self]
            return desc.extension
    
    
    @dataclass
    class FileFormatDesc:
        """Static properties of one file format (a value of the `format_desc` table)."""
        # Human-readable description of the format
        friendly_name: str
        # Name of the csv dialect, as passed to csv.reader()/csv.writer()
        csv_dialect: str
        # Character set name
        charset: str
        # Content type string, including the charset parameter
        content_type: str
        # File name extension, without a leading dot
        extension: str
    
    
    # Properties of every supported format, keyed by FileFormat member.
    # The accessor methods of FileFormat read their values from this table.
    format_desc = {
        # Comma-separated UTF-8, read and written with the standard 'excel' dialect.
        FileFormat.en_csv: FileFormatDesc(
            friendly_name='sloupce oddělené čárkami v UTF-8 (anglický Excel)',
            csv_dialect='excel',
            charset='utf-8',
            content_type='text/csv; charset=utf-8',
            extension='csv',
        ),
        # Semicolon-separated windows-1250, using the 'cs-excel' dialect.
        # NOTE(review): the content type is text/plain here, unlike text/csv
        # for en_csv -- presumably deliberate; confirm.
        FileFormat.cs_csv: FileFormatDesc(
            friendly_name='sloupce oddělené středníky ve windows-1250 (český Excel)',
            csv_dialect='cs-excel',
            charset='windows-1250',
            content_type='text/plain; charset=windows-1250',
            extension='csv',
        ),
        # Tab-separated UTF-8, using the 'tsv' dialect.
        FileFormat.tsv: FileFormatDesc(
            friendly_name='sloupce oddělené tabulátory v UTF-8',
            csv_dialect='tsv',
            charset='utf-8',
            content_type='text/tab-separated-values; charset=utf-8',
            extension='tsv',
        ),
    }
    
    
    class TSV(csv.Dialect):
        """csv dialect for tab-separated values.

        Fields are separated by a tab and records are terminated by a bare LF.
        A field is quoted only when it has to be, and a quote character inside
        a quoted field is written doubled.
        """

        # Record layout
        delimiter = '\t'
        lineterminator = '\n'

        # Quoting rules
        quotechar = '"'
        quoting = csv.QUOTE_MINIMAL
        doublequote = True
    
    
    class CzechExcel(csv.excel):
        """The standard `excel` csv dialect with a semicolon as the delimiter."""
        delimiter = ';'
    
    
    # Register the custom dialects under the names used by the csv_dialect
    # fields of format_desc, so they can be selected by name.
    csv.register_dialect('tsv', TSV)
    csv.register_dialect('cs-excel', CzechExcel)
    
    
    class Row:
        """A table row is a dataclass derived from this class. Its attributes
        correspond to columns; they must be of type str and have a default value."""
    
    
    class MissingHeaderError(RuntimeError):
        """Raised by read() when the header row contains no known column name."""
        pass
    
    
    def write(file: IO, fmt: FileFormat, row_class: Type[Row], rows: Sequence[Row]):
        """Write `rows` to `file` as a table in the csv dialect of `fmt`.

        The first record written is a header with the field names of
        `row_class` in declaration order. Each following record holds the
        values of those attributes for one element of `rows`.

        Only the dialect of `fmt` is used here; this function does not apply
        the format's charset to `file`.
        """
        out = csv.writer(file, dialect=fmt.get_dialect())

        names = [f.name for f in fields(row_class)]
        out.writerow(names)

        # Records are produced lazily, one per row, in the header's column order.
        out.writerows([getattr(item, name) for name in names] for item in rows)
    
    
    def read(file: IO, fmt: FileFormat, row_class: Type[Row]):
        """Read a table from `file` in the csv dialect of `fmt`.

        The first record is the header. A leading BOM (as written by Excel) is
        removed from its first field. Header names are matched against the
        field names of `row_class`; columns with unknown names are ignored,
        and a warning is produced for an unknown name that closely resembles
        a known one.

        Every following record becomes one `row_class()` instance with the
        stripped cell values assigned to the attributes named by the header.
        Cells that are empty or contain just '-' are left at the attribute's
        default. Records in which every cell is empty or '-' are skipped.
        Cells beyond the end of the header are ignored.

        Returns:
            A tuple (rows, warnings): the list of parsed rows and a list of
            warning messages.

        Raises:
            MissingHeaderError: if the first record contains no known column
                name (this includes a blank first line).
            csv.Error: on malformed input (the reader runs in strict mode).
        """
        reader = csv.reader(file, dialect=fmt.get_dialect(), strict=True)

        warnings = []
        header: List[str] = []
        rows: List[Row] = []
        columns = set(field.name for field in fields(row_class))

        # The header is the first *record*. reader.line_num counts physical
        # lines, so it cannot be used to recognise it: a quoted header field
        # containing a newline makes the first record span several lines.
        header_seen = False

        for r in reader:
            if not header_seen:
                header_seen = True
                # A blank first line yields an empty record, so guard the index.
                if r and r[0].startswith('\ufeff'):
                    # Excel's BOM
                    r[0] = r[0][1:]
                header = r
                if not any(h in columns for h in header):
                    raise MissingHeaderError()
                for h in header:
                    if h not in columns:
                        best_matches = difflib.get_close_matches(h, columns, n=1, cutoff=0.8)
                        if best_matches:
                            warnings.append(
                                "Neznámý sloupec '{}', měli jste na mysli '{}'?".format(
                                    h, best_matches[0]))
                continue

            row = row_class()
            not_empty = False
            # zip() stops at the shorter of header and record.
            for f, cell in zip(header, r):
                x = cell.strip()
                if x != "" and x != '-':
                    # A value in an unknown column still marks the row as non-empty.
                    not_empty = True
                    if f in columns:
                        setattr(row, f, x)
            if not_empty:
                rows.append(row)

        return (rows, warnings)