Avoid mis-identifying certain PDF files as UTF-8 text

Closes #110.

Avoid mis-identifying certain PDF files as UTF-8 text
f0bcaca5 · Martin Mareš · 4350dace · f0bcaca5
Commit f0bcaca5 authored 9 months ago by Martin Mareš
--- a/owl/post.py
+++ b/owl/post.py
@@ -96,12 +96,17 @@ def is_utf8(f) -> bool:
    f.seek(0)
    x = io.TextIOWrapper(f, encoding='utf-8', errors='strict')
    try:
-        x.read(4096)
+        head = x.read(4096)
+        if head.startswith('%PDF-'):
+            # PDF files are expected to contain non-ASCII bytes at their beginning,
+            # but surprisingly enough, these bytes sometimes decode as UTF-8.
+            verdict = False
+        else:
+            verdict = True
    except UnicodeDecodeError:
+        verdict = False
    x.detach()
-        return False
+    return verdict
-    x.detach()
-    return True
 # Presentation of score