chore: remove unnecessary csv file type

Signed-off-by: EricXiao <taoiaox@gmail.com>
2025-11-17 14:41:55 +08:00 · 2025-11-17 14:41:55 +08:00 · 983bfae4fc
commit 983bfae4fc
parent 01f9dd957c
1 changed files with 0 additions and 181 deletions
--- a/cognee/infrastructure/files/utils/is_csv_content.py
+++ b/cognee/infrastructure/files/utils/is_csv_content.py
@ -1,181 +0,0 @@
-import csv
-from collections import Counter
-
-
-def is_csv_content(content):
-    """
-    Heuristically determine whether a bytes-like object is CSV text.
-
-    Strategy (fail-fast and cheap to expensive):
-      1) Decode: Try a small ordered list of common encodings with strict errors.
-      2) Line sampling: require >= 2 non-empty lines; sample up to 50 lines.
-      3) Delimiter detection:
-         - Prefer csv.Sniffer() with common delimiters.
-         - Fallback to a lightweight consistency heuristic.
-      4) Lightweight parse check:
-         - Parse a few lines with the delimiter.
-         - Ensure at least 2 valid rows and relatively stable column counts.
-
-    Returns:
-        bool: True if the buffer looks like CSV; False otherwise.
-    """
-    try:
-        encoding_list = [
-            "utf-8",
-            "utf-8-sig",
-            "utf-32-le",
-            "utf-32-be",
-            "utf-16-le",
-            "utf-16-be",
-            "gb18030",
-            "shift_jis",
-            "cp949",
-            "cp1252",
-            "iso-8859-1",
-        ]
-
-        # Try to decode strictly—if decoding fails for all encodings, it's not text/CSV.
-        text = None
-        for enc in encoding_list:
-            try:
-                text = content.decode(enc, errors="strict")
-                break
-            except UnicodeDecodeError:
-                continue
-        if text is None:
-            return False
-
-        # Reject empty/whitespace-only payloads.
-        stripped = text.strip()
-        if not stripped:
-            return False
-
-        # Split into logical lines and drop empty ones. Require at least two lines.
-        lines = [ln for ln in text.splitlines() if ln.strip()]
-        if len(lines) < 2:
-            return False
-
-        # Take a small sample to keep sniffing cheap and predictable.
-        sample_lines = lines[:50]
-
-        # Detect delimiter using csv.Sniffer first; if that fails, use our heuristic.
-        delimiter = _sniff_delimiter(sample_lines) or _heuristic_delimiter(sample_lines)
-        if not delimiter:
-            return False
-
-        # Finally, do a lightweight parse sanity check with the chosen delimiter.
-        return _lightweight_parse_check(sample_lines, delimiter)
-    except Exception:
-        return False
-
-
-def _sniff_delimiter(lines):
-    """
-    Try Python's built-in csv.Sniffer on a sample.
-
-    Args:
-        lines (list[str]): Sample lines (already decoded).
-
-    Returns:
-        str | None: The detected delimiter if sniffing succeeds; otherwise None.
-    """
-    # Join up to 50 lines to form the sample string Sniffer will inspect.
-    sample = "\n".join(lines[:50])
-    try:
-        dialect = csv.Sniffer().sniff(sample, delimiters=",\t;|")
-        return dialect.delimiter
-    except Exception:
-        # Sniffer is known to be brittle on small/dirty samples—silently fallback.
-        return None
-
-
-def _heuristic_delimiter(lines):
-    """
-    Fallback delimiter detection based on count consistency per line.
-
-    Heuristic:
-      - For each candidate delimiter, count occurrences per line.
-      - Keep only lines with count > 0 (line must contain the delimiter).
-      - Require at least half of lines to contain the delimiter (min 2).
-      - Compute the mode (most common count). If the proportion of lines that
-        exhibit the modal count is >= 80%, accept that delimiter.
-
-    Args:
-        lines (list[str]): Sample lines.
-
-    Returns:
-        str | None: Best delimiter if one meets the consistency threshold; else None.
-    """
-    candidates = [",", "\t", ";", "|"]
-    best = None
-    best_score = 0.0
-
-    for d in candidates:
-        # Count how many times the delimiter appears in each line.
-        counts = [ln.count(d) for ln in lines]
-        # Consider only lines that actually contain the delimiter at least once.
-        nonzero = [c for c in counts if c > 0]
-
-        # Require that more than half of lines (and at least 2) contain the delimiter.
-        if len(nonzero) < max(2, int(0.5 * len(lines))):
-            continue
-
-        # Find the modal count and its frequency.
-        cnt = Counter(nonzero)
-        pairs = cnt.most_common(1)
-        if not pairs:
-            continue
-
-        mode, mode_freq = pairs[0]
-        # Consistency ratio: lines with the modal count / total lines in the sample.
-        consistency = mode_freq / len(lines)
-        # Accept if consistent enough and better than any previous candidate.
-        if mode >= 1 and consistency >= 0.80 and consistency > best_score:
-            best = d
-            best_score = consistency
-
-    return best
-
-
-def _lightweight_parse_check(lines, delimiter):
-    """
-    Parse a few lines with csv.reader and check structural stability.
-
-    Heuristic:
-      - Parse up to 5 lines with the given delimiter.
-      - Count column widths per parsed row.
-      - Require at least 2 non-empty rows.
-      - Allow at most 1 row whose width deviates by >2 columns from the first row.
-
-    Args:
-        lines (list[str]): Sample lines (decoded).
-        delimiter (str): Delimiter chosen by sniffing/heuristics.
-
-    Returns:
-        bool: True if parsing looks stable; False otherwise.
-    """
-    try:
-        # csv.reader accepts any iterable of strings; feeding the first 10 lines is fine.
-        reader = csv.reader(lines[:10], delimiter=delimiter)
-        widths = []
-        valid_rows = 0
-        for row in reader:
-            if not row:
-                continue
-
-            widths.append(len(row))
-            valid_rows += 1
-
-        # Need at least two meaningful rows to make a judgment.
-        if valid_rows < 2:
-            return False
-
-        if widths:
-            first = widths[0]
-            # Count rows whose width deviates significantly (>2) from the first row.
-            unstable = sum(1 for w in widths if abs(w - first) > 2)
-            # Permit at most 1 unstable row among the parsed sample.
-            return unstable <= 1
-        return False
-    except Exception:
-        return False