# Source code for knowledgespaces.io.csv

"""
CSV import/export for KST objects.

Supports three standard CSV formats:
- **Skill map matrix**: rows=items, cols=skills, binary (μ: items→skills).
- **Prerequisite matrix**: rows=labels, cols=labels, binary (surmise relation).
- **Knowledge structure**: state_size, state_id, then binary columns per item.

All CSV files use the first column as row index and the first row as header.
"""

from __future__ import annotations

import csv
from pathlib import Path
from typing import Union

from knowledgespaces.derivation.skill_map import SkillMap
from knowledgespaces.structures.knowledge_structure import KnowledgeStructure
from knowledgespaces.structures.relations import SurmiseRelation

PathLike = Union[str, Path]


def _validate_unique(labels: list[str], context: str) -> None:
    """Validate that all labels in a list are unique."""
    seen: set[str] = set()
    for label in labels:
        if label in seen:
            raise ValueError(f"{context}: duplicate label '{label}'.")
        seen.add(label)


def _validate_binary_row(
    row: list[str],
    expected_cols: int,
    row_label: str,
    context: str,
) -> None:
    """Validate that a CSV data row has the right width and only 0/1 values."""
    actual = len(row) - 1  # exclude the row-label column
    if actual != expected_cols:
        raise ValueError(
            f"{context}: row '{row_label}' has {actual} data columns, expected {expected_cols}."
        )
    for j, val in enumerate(row[1:]):
        if val not in ("0", "1"):
            raise ValueError(
                f"{context}: row '{row_label}', column {j}: value {val!r} is not '0' or '1'."
            )


# ------------------------------------------------------------------
# Skill map
# ------------------------------------------------------------------


def read_skill_map(path: PathLike) -> SkillMap:
    """Read a skill map from CSV.

    Expected format::

        ,skill1,skill2,...
        item1,0,1,...
        item2,1,0,...

    Raises
    ------
    ValueError
        If the file is empty, rows have wrong column count or non-binary
        values, or labels are duplicated.
    """
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        try:
            header = next(reader)
        except StopIteration:
            # An empty file previously leaked a raw StopIteration; report it
            # as a ValueError like every other malformed-input case.
            raise ValueError("Skill map CSV: file is empty.") from None
        skills = header[1:]
        if not skills:
            raise ValueError("Skill map CSV: header has no skill columns.")
        _validate_unique(skills, "Skill map CSV header")
        n_skills = len(skills)
        items: list[str] = []
        mapping: dict[str, frozenset[str]] = {}
        # start=2: line 1 is the header, so the first data row is line 2.
        for line_no, row in enumerate(reader, start=2):
            if not row:
                continue  # tolerate blank lines
            item = row[0]
            _validate_binary_row(row, n_skills, item, f"Skill map CSV line {line_no}")
            items.append(item)
            # The skills marked "1" in this row are those required by the item.
            required = frozenset(skills[j] for j, val in enumerate(row[1:]) if val == "1")
            mapping[item] = required
        if not items:
            raise ValueError("Skill map CSV: no data rows.")
        _validate_unique(items, "Skill map CSV row labels")
    return SkillMap(items, skills, mapping)
def write_skill_map(skill_map: SkillMap, path: PathLike) -> None:
    """Write a skill map to CSV (items as rows, skills as columns)."""
    items, skills, matrix = skill_map.to_matrix()
    with open(path, "w", newline="", encoding="utf-8") as f:
        out = csv.writer(f)
        # Header row: empty corner cell followed by the skill labels.
        out.writerow([""] + list(skills))
        for row_label, row_values in zip(items, matrix):
            out.writerow([row_label] + [str(v) for v in row_values])
# ------------------------------------------------------------------
# Surmise relation (prerequisite matrix)
# ------------------------------------------------------------------
def read_relation(path: PathLike) -> SurmiseRelation:
    """Read a surmise relation from a CSV prerequisite matrix.

    Expected format::

        ,label1,label2,...
        label1,0,1,...
        label2,0,0,...

    Row labels must match column labels exactly.

    Raises
    ------
    ValueError
        If the file is empty, rows have wrong column count, non-binary
        values, or row labels don't match header labels.
    """
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        try:
            header = next(reader)
        except StopIteration:
            # An empty file previously leaked a raw StopIteration; report it
            # as a ValueError like every other malformed-input case.
            raise ValueError("Prerequisite CSV: file is empty.") from None
        labels = header[1:]
        if not labels:
            raise ValueError("Prerequisite CSV: header has no label columns.")
        _validate_unique(labels, "Prerequisite CSV header")
        n_labels = len(labels)
        label_set = set(labels)
        seen_rows: set[str] = set()  # set instead of list: O(1) duplicate check
        relations: set[tuple[str, str]] = set()
        # start=2: line 1 is the header, so the first data row is line 2.
        for line_no, row in enumerate(reader, start=2):
            if not row:
                continue  # tolerate blank lines
            row_label = row[0]
            if row_label not in label_set:
                raise ValueError(
                    f"Prerequisite CSV line {line_no}: row label "
                    f"'{row_label}' is not in the header labels {sorted(label_set)}."
                )
            _validate_binary_row(row, n_labels, row_label, f"Prerequisite CSV line {line_no}")
            if row_label in seen_rows:
                raise ValueError(
                    f"Prerequisite CSV line {line_no}: duplicate row label '{row_label}'."
                )
            seen_rows.add(row_label)
            # A "1" in column j means row_label is a prerequisite pair with labels[j].
            for j, val in enumerate(row[1:]):
                if val == "1":
                    relations.add((row_label, labels[j]))
        if seen_rows != label_set:
            missing = label_set - seen_rows
            raise ValueError(f"Prerequisite CSV: missing rows for labels: {sorted(missing)}.")
    return SurmiseRelation(labels, relations)
def write_relation(relation: SurmiseRelation, path: PathLike) -> None:
    """Write a surmise relation to a CSV prerequisite matrix."""
    items, matrix = relation.to_matrix()
    with open(path, "w", newline="", encoding="utf-8") as f:
        out = csv.writer(f)
        # Header row: empty corner cell followed by the item labels.
        out.writerow([""] + list(items))
        for row_label, row_values in zip(items, matrix):
            out.writerow([row_label] + [str(v) for v in row_values])
# ------------------------------------------------------------------
# Knowledge structure (state matrix)
# ------------------------------------------------------------------
def read_structure(path: PathLike) -> KnowledgeStructure:
    """Read a knowledge structure from CSV.

    Expected format::

        state_size,state_id,item1,item2,...
        0,0,0,0,...
        1,1,1,0,...

    The ``state_size`` and ``state_id`` columns are ignored on input; only
    the binary item columns determine each state.

    Raises
    ------
    ValueError
        If the file is empty, the header is too short, rows have wrong
        column count, or item values are non-binary.
    """
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        try:
            header = next(reader)
        except StopIteration:
            # An empty file previously leaked a raw StopIteration; report it
            # as a ValueError like every other malformed-input case.
            raise ValueError("Structure CSV: file is empty.") from None
        if len(header) < 3:
            raise ValueError(
                "Structure CSV: header must have at least "
                "state_size, state_id, and one item column."
            )
        items = header[2:]
        _validate_unique(items, "Structure CSV header items")
        n_items = len(items)
        states: list[frozenset[str]] = []
        # start=2: line 1 is the header, so the first data row is line 2.
        for line_no, row in enumerate(reader, start=2):
            if not row:
                continue  # tolerate blank lines
            actual_data = len(row) - 2  # skip state_size, state_id
            if actual_data != n_items:
                raise ValueError(
                    f"Structure CSV line {line_no}: {actual_data} item columns, expected {n_items}."
                )
            for j, val in enumerate(row[2:]):
                if val not in ("0", "1"):
                    raise ValueError(
                        f"Structure CSV line {line_no}, item '{items[j]}': "
                        f"value {val!r} is not '0' or '1'."
                    )
            state = frozenset(items[j] for j, val in enumerate(row[2:]) if val == "1")
            states.append(state)
    return KnowledgeStructure(items, states)
def write_structure(structure: KnowledgeStructure, path: PathLike) -> None:
    """Write a knowledge structure to CSV.

    Items become sorted columns; states become rows ordered by size then
    lexicographically, preceded by ``state_size`` and ``state_id`` columns.
    """
    columns = sorted(structure.domain)
    # Deterministic row order: smaller states first, ties broken lexically.
    ordered = sorted(structure.states, key=lambda st: (len(st), sorted(st)))
    with open(path, "w", newline="", encoding="utf-8") as f:
        out = csv.writer(f)
        out.writerow(["state_size", "state_id", *columns])
        for state_id, st in enumerate(ordered):
            flags = ["1" if c in st else "0" for c in columns]
            out.writerow([str(len(st)), str(state_id), *flags])