Source code for knowledgespaces.assessment.instances

"""
Instance pool: multiple questions per item.

In KST assessment, each item (competency) can be tested through
multiple instances (concrete questions). The adaptive engine selects
the best instance, maps it to its parent item for the BLIM update,
and excludes only that specific instance from future selection.

This module provides the InstancePool data structure and an
instance-aware version of select_item_eig.
"""

from __future__ import annotations

import random
from dataclasses import dataclass

import numpy as np

from knowledgespaces.assessment.blim import StatePosterior, shannon_entropy


@dataclass(frozen=True)
class Instance:
    """One concrete question, tied to the item it assesses.

    Instances are immutable value objects: two instances with the same
    ``id`` and ``item`` compare equal and hash identically.

    Parameters
    ----------
    id : str
        Unique identifier for this instance.
    item : str
        The item (competency) this instance tests.
    """

    id: str
    item: str
class InstancePool:
    """A collection of instances grouped by the item they test.

    Parameters
    ----------
    instances : list[Instance]
        All available instances. The input is consumed in a single pass,
        so any iterable (including a generator) is accepted.

    Raises
    ------
    ValueError
        If two instances share the same ID.

    Notes
    -----
    The constructor has no knowledge of a structure's domain. Call
    :meth:`validate_domain` to check the pool's items against a domain.
    """

    def __init__(self, instances: list[Instance]) -> None:
        self._instances: dict[str, Instance] = {}
        self._by_item: dict[str, list[str]] = {}
        # Single pass: walking the input several times would let a one-shot
        # iterator be drained by the duplicate check and silently yield an
        # empty pool.
        for inst in instances:
            if inst.id in self._instances:
                raise ValueError(f"Duplicate instance ID: '{inst.id}'.")
            self._instances[inst.id] = inst
            self._by_item.setdefault(inst.item, []).append(inst.id)

    @property
    def items(self) -> set[str]:
        """All unique items covered by this pool (a fresh set each call)."""
        return set(self._by_item)

    @property
    def instance_ids(self) -> set[str]:
        """IDs of every instance in the pool (a fresh set each call)."""
        return set(self._instances)

    def item_of(self, instance_id: str) -> str:
        """Get the parent item of an instance.

        Raises
        ------
        KeyError
            If ``instance_id`` is not in the pool.
        """
        return self._instances[instance_id].item

    def instances_for(self, item: str) -> list[str]:
        """Get all instance IDs for a given item.

        Returns a new list in insertion order; an unknown item yields an
        empty list rather than an error.
        """
        return list(self._by_item.get(item, []))

    def n_instances(self) -> int:
        """Return the total number of instances (same value as ``len(pool)``)."""
        return len(self._instances)

    def validate_domain(self, domain: frozenset[str]) -> None:
        """Verify that pool items match the structure's domain.

        Parameters
        ----------
        domain : frozenset[str]
            The items the pool is expected to cover exactly.

        Raises
        ------
        ValueError
            If pool contains items not in domain, or domain has items
            with no instances.
        """
        pool_items = self.items
        extra = pool_items - domain
        if extra:
            raise ValueError(f"InstancePool contains items not in the domain: {extra}")
        missing = domain - pool_items
        if missing:
            raise ValueError(f"Domain items have no instances in the pool: {missing}")

    @classmethod
    def from_dict(cls, mapping: dict[str, list[str]]) -> InstancePool:
        """Create from {item: [instance_id, ...]}.

        Example::

            pool = InstancePool.from_dict({
                "addition": ["add_q1", "add_q2", "add_q3"],
                "subtraction": ["sub_q1", "sub_q2"],
            })

        Raises
        ------
        ValueError
            If the same instance ID appears more than once in ``mapping``.
        """
        return cls(
            [
                Instance(id=inst_id, item=item)
                for item, ids in mapping.items()
                for inst_id in ids
            ]
        )

    def __len__(self) -> int:
        """Return the total number of instances in the pool."""
        return len(self._instances)
@dataclass(frozen=True)
class InstanceScore:
    """Immutable result of scoring an instance under the EIG policy.

    Fields, in order: the instance's ID, the item that instance tests,
    and the score assigned to it.
    """

    instance_id: str
    item: str
    score: float
def _expected_information_gain(
    posterior: StatePosterior,
    item: str,
    current_entropy: float,
) -> float:
    """Return ``current_entropy`` minus the expected entropy after asking ``item``."""
    blim = posterior.blim
    prior = posterior.probabilities

    # Unnormalised posterior given a correct response. Its total mass is the
    # predictive probability of a correct response, so compute the product once.
    post_correct = blim.likelihood_vector(item, True) * prior
    mass_correct = post_correct.sum()
    prob_correct = float(mass_correct)
    if mass_correct > 0:
        post_correct = post_correct / mass_correct

    # Same for an incorrect response; left unnormalised if it has zero mass.
    post_incorrect = blim.likelihood_vector(item, False) * prior
    mass_incorrect = post_incorrect.sum()
    if mass_incorrect > 0:
        post_incorrect = post_incorrect / mass_incorrect

    expected_entropy = prob_correct * shannon_entropy(post_correct) + (
        1 - prob_correct
    ) * shannon_entropy(post_incorrect)
    return current_entropy - expected_entropy


def select_instance_eig(
    posterior: StatePosterior,
    pool: InstancePool,
    asked: set[str] | None = None,
) -> InstanceScore:
    """Select the instance maximizing Expected Information Gain.

    This is the instance-aware version of select_item_eig. EIG is a
    property of the item, so it is computed once per item that still has
    an un-asked instance. The highest-scoring item wins (ties go to the
    item that sorts first), and one of its un-asked instances is drawn
    with ``random.choice``, since instances of the same item are
    equivalent from the BLIM perspective.

    Parameters
    ----------
    posterior : StatePosterior
        Current state distribution.
    pool : InstancePool
        Available instances.
    asked : set[str] or None
        Instance IDs already asked (excluded from selection). The set is
        only read, never modified.

    Returns
    -------
    InstanceScore
        The best instance, its parent item, and EIG score.

    Raises
    ------
    ValueError
        If no un-asked instances remain, or if pool items don't match
        the structure's domain.
    """
    asked = asked or set()
    blim = posterior.blim
    pool.validate_domain(blim.structure.domain)

    # Keep only items that can still be asked. Sorted iteration fixes the
    # dict's insertion order, which is what makes tie-breaking deterministic.
    available: dict[str, list[str]] = {}
    for item in sorted(blim.structure.domain):
        remaining = [iid for iid in pool.instances_for(item) if iid not in asked]
        if remaining:
            available[item] = remaining
    if not available:
        raise ValueError("No un-asked instances available.")

    current_entropy = shannon_entropy(posterior.probabilities)
    item_eig: dict[str, float] = {
        item: _expected_information_gain(posterior, item, current_entropy)
        for item in available
    }

    # max() returns the first maximal key in iteration order, i.e. the
    # alphabetically first item among ties.
    best_item = max(item_eig, key=item_eig.__getitem__)
    # Exactly one RNG draw per call: only the winning item needs a pick.
    return InstanceScore(
        instance_id=random.choice(available[best_item]),
        item=best_item,
        score=item_eig[best_item],
    )