Source code for knowledgespaces.metrics.distances

"""
Distance measures between knowledge structures.

Provides formal metrics for comparing two knowledge structures:
- Symmetric difference distance
- Hausdorff distance (on states)
- Directional Hamming distances (H→A and A→H)
- Graph Edit Distance on prerequisite relations

All functions require matching domains.
"""

from __future__ import annotations

from dataclasses import dataclass

from knowledgespaces.structures.knowledge_structure import KnowledgeStructure
from knowledgespaces.structures.relations import SurmiseRelation


def _validate_same_domain_structures(
    ks1: KnowledgeStructure,
    ks2: KnowledgeStructure,
    fn_name: str,
) -> None:
    if ks1.domain != ks2.domain:
        raise ValueError(
            f"{fn_name}: structures must have the same domain. "
            f"Got {sorted(ks1.domain)} and {sorted(ks2.domain)}."
        )


def _validate_same_domain_relations(
    rel1: SurmiseRelation,
    rel2: SurmiseRelation,
    fn_name: str,
) -> None:
    if rel1.items != rel2.items:
        raise ValueError(
            f"{fn_name}: relations must have the same domain. "
            f"Got {sorted(rel1.items)} and {sorted(rel2.items)}."
        )


[docs] def symmetric_difference( ks1: KnowledgeStructure, ks2: KnowledgeStructure, ) -> int: """Number of states in exactly one of the two structures. Card(K1 △ K2) = Card(K1 \\ K2) + Card(K2 \\ K1) Raises ------ ValueError If structures have different domains. """ _validate_same_domain_structures(ks1, ks2, "symmetric_difference") only1 = ks1.states - ks2.states only2 = ks2.states - ks1.states return len(only1) + len(only2)
def _hamming(s1: frozenset[str], s2: frozenset[str]) -> int: """Hamming distance between two states (size of symmetric difference).""" return len(s1 ^ s2)
[docs] def hausdorff( ks1: KnowledgeStructure, ks2: KnowledgeStructure, ) -> int: """Hausdorff distance between two structures. max(max over K in K1 of min over L in K2 of d(K,L), max over L in K2 of min over K in K1 of d(K,L)) where d is the Hamming distance between states. Returns 0 if the structures are identical. Raises ------ ValueError If structures have different domains. """ _validate_same_domain_structures(ks1, ks2, "hausdorff") if not ks1.states or not ks2.states: return 0 def _directed(source: frozenset[frozenset[str]], target: frozenset[frozenset[str]]) -> int: return max(min(_hamming(s, t) for t in target) for s in source) d1 = _directed(ks1.states, ks2.states) d2 = _directed(ks2.states, ks1.states) return max(d1, d2)
[docs] @dataclass(frozen=True) class DirectionalDistances: """Directional Hamming distances between two structures. Attributes ---------- forward_mean : float Mean min-distance from ks1 states to nearest ks2 state. backward_mean : float Mean min-distance from ks2 states to nearest ks1 state. forward_max : int Max min-distance from ks1 to ks2. backward_max : int Max min-distance from ks2 to ks1. """ forward_mean: float backward_mean: float forward_max: int backward_max: int
[docs] def directional_distances( ks1: KnowledgeStructure, ks2: KnowledgeStructure, ) -> DirectionalDistances: """Compute directional Hamming distances. Raises ------ ValueError If structures have different domains. """ _validate_same_domain_structures(ks1, ks2, "directional_distances") def _min_distances( source: frozenset[frozenset[str]], target: frozenset[frozenset[str]] ) -> list[int]: return [min(_hamming(s, t) for t in target) for s in source] fwd = _min_distances(ks1.states, ks2.states) bwd = _min_distances(ks2.states, ks1.states) return DirectionalDistances( forward_mean=sum(fwd) / len(fwd) if fwd else 0.0, backward_mean=sum(bwd) / len(bwd) if bwd else 0.0, forward_max=max(fwd) if fwd else 0, backward_max=max(bwd) if bwd else 0, )
[docs] def graph_edit_distance( rel1: SurmiseRelation, rel2: SurmiseRelation, ) -> tuple[int, int, int]: """Edge differences between two surmise relation graphs. Returns ------- added : int Edges in rel2 but not in rel1. removed : int Edges in rel1 but not in rel2. total : int added + removed. Raises ------ ValueError If relations have different domains. """ _validate_same_domain_relations(rel1, rel2, "graph_edit_distance") # Normalize to transitive closure so that equivalent partial orders # (e.g. a→b→c vs a→b→c + a→c) produce the same result. r1 = rel1.transitive_closure().relations r2 = rel2.transitive_closure().relations added = len(r2 - r1) removed = len(r1 - r2) return added, removed, added + removed