Source code for knowledgespaces.assessment.instances

"""
Instance pool: multiple questions per item.

In KST assessment, each item (competency) can be tested through
multiple instances (concrete questions). The adaptive engine selects
the best instance, maps it to its parent item for the BLIM update,
and excludes only that specific instance from future selection.

This module provides the InstancePool data structure and an
instance-aware version of select_item_eig.
"""

from __future__ import annotations

import random
from collections.abc import Iterable
from dataclasses import dataclass

import numpy as np

from knowledgespaces.assessment.blim import StatePosterior, shannon_entropy



[docs]
@dataclass(frozen=True)
class Instance:
    """One concrete question, tagged with the item it assesses.

    The dataclass is frozen, so an ``Instance`` is an immutable value
    object: two instances with the same ``id`` and ``item`` compare equal
    and hash identically.

    Parameters
    ----------
    id : str
        Identifier of the question. An ``InstancePool`` rejects two
        instances that share the same ``id``.
    item : str
        Name of the item (competency) that this question tests.
    """

    id: str
    item: str




[docs]
class InstancePool:
    """A collection of instances indexed by ID and grouped by item.

    Parameters
    ----------
    instances : iterable of Instance
        All available instances. Any iterable is accepted, including a
        one-shot iterator; it is consumed exactly once.

    Raises
    ------
    ValueError
        If two instances share the same ID.

    Notes
    -----
    The constructor does not know the structure's domain. Use
    :meth:`validate_domain` to check the pool's items against a domain.
    """

    def __init__(self, instances: Iterable[Instance]) -> None:
        self._instances: dict[str, Instance] = {}
        self._by_item: dict[str, list[str]] = {}
        # Single pass: duplicate detection and both indexes are built
        # together so that generators/iterators are not exhausted by an
        # earlier validation loop.
        for inst in instances:
            if inst.id in self._instances:
                raise ValueError(f"Duplicate instance ID: '{inst.id}'.")
            self._instances[inst.id] = inst
            self._by_item.setdefault(inst.item, []).append(inst.id)

    @property
    def items(self) -> set[str]:
        """All unique items covered by this pool."""
        return set(self._by_item)

    @property
    def instance_ids(self) -> set[str]:
        """IDs of every instance in this pool."""
        return set(self._instances)

    def item_of(self, instance_id: str) -> str:
        """Get the parent item of an instance.

        Raises
        ------
        KeyError
            If ``instance_id`` is not in the pool.
        """
        return self._instances[instance_id].item

    def instances_for(self, item: str) -> list[str]:
        """Get all instance IDs for a given item.

        The IDs are returned in insertion order, as a fresh list that the
        caller may mutate freely. An item unknown to the pool yields an
        empty list.
        """
        return list(self._by_item.get(item, []))

    def n_instances(self) -> int:
        """Number of instances in the pool (same value as ``len(pool)``)."""
        return len(self._instances)

    def validate_domain(self, domain: frozenset[str]) -> None:
        """Verify that pool items match the structure's domain.

        Raises
        ------
        ValueError
            If pool contains items not in domain, or domain has items
            with no instances.
        """
        extra = self.items - domain
        if extra:
            raise ValueError(f"InstancePool contains items not in the domain: {extra}")
        missing = domain - self.items
        if missing:
            raise ValueError(f"Domain items have no instances in the pool: {missing}")

    @classmethod
    def from_dict(cls, mapping: dict[str, list[str]]) -> InstancePool:
        """Create from {item: [instance_id, ...]}.

        Example::

            pool = InstancePool.from_dict({
                "addition": ["add_q1", "add_q2", "add_q3"],
                "subtraction": ["sub_q1", "sub_q2"],
            })

        Raises
        ------
        ValueError
            If the same instance ID appears more than once in the mapping.
        """
        return cls(
            Instance(id=inst_id, item=item)
            for item, ids in mapping.items()
            for inst_id in ids
        )

    def __len__(self) -> int:
        """Number of instances in the pool."""
        return len(self._instances)




[docs]
@dataclass(frozen=True)
class InstanceScore:
    """An instance together with its score under the EIG policy.

    The dataclass is frozen, so the result is an immutable value object.

    Parameters
    ----------
    instance_id : str
        ID of the scored instance.
    item : str
        Parent item of that instance.
    score : float
        Score attributed to the instance.
    """

    instance_id: str
    item: str
    score: float




[docs]
def select_instance_eig(
    posterior: StatePosterior,
    pool: InstancePool,
    asked: set[str] | None = None,
) -> InstanceScore:
    """Select the instance maximizing Expected Information Gain.

    This is the instance-aware version of select_item_eig. It computes
    EIG per item, picks an item with the highest EIG (ties broken at
    random), then selects a random un-asked instance of that item (since
    instances of the same item are equivalent from the BLIM perspective).

    EIG is only evaluated for items that still have at least one un-asked
    instance; exhausted items can never be selected, so scoring them
    would be wasted work.

    Parameters
    ----------
    posterior : StatePosterior
        Current state distribution.
    pool : InstancePool
        Available instances.
    asked : set[str] or None
        Instance IDs already asked (excluded from selection).

    Returns
    -------
    InstanceScore
        The best instance, its parent item, and EIG score.

    Raises
    ------
    ValueError
        If no un-asked instances remain, or if pool items don't match
        the structure's domain.
    """
    asked = asked or set()
    blim = posterior.blim
    domain = blim.structure.domain
    pool.validate_domain(domain)

    # Un-asked instances per item, built once. Items are visited in sorted
    # order so that the candidate list handed to random.choice below is
    # deterministic for a given seed.
    remaining: dict[str, list[str]] = {}
    for item in sorted(domain):
        unasked = [iid for iid in pool.instances_for(item) if iid not in asked]
        if unasked:
            remaining[item] = unasked
    if not remaining:
        # Fail before doing any likelihood/entropy work.
        raise ValueError("No un-asked instances available.")

    prior = posterior.probabilities
    current_entropy = shannon_entropy(prior)
    item_eig: dict[str, float] = {}

    for item in remaining:
        # Unnormalised posterior after a correct answer; its total mass is
        # the predictive probability of a correct answer.
        joint_correct = blim.likelihood_vector(item, True) * prior
        mass_correct = joint_correct.sum()
        prob_correct = float(mass_correct)

        joint_incorrect = blim.likelihood_vector(item, False) * prior
        mass_incorrect = joint_incorrect.sum()

        # An outcome with zero probability contributes nothing to the
        # expectation, so skip its entropy instead of evaluating
        # shannon_entropy on an all-zero, non-normalised vector.
        # NOTE(review): how shannon_entropy treats zeros is not visible
        # here; the guard makes this function independent of that.
        entropy_correct = (
            shannon_entropy(joint_correct / mass_correct) if mass_correct > 0 else 0.0
        )
        entropy_incorrect = (
            shannon_entropy(joint_incorrect / mass_incorrect) if mass_incorrect > 0 else 0.0
        )

        expected_entropy = (
            prob_correct * entropy_correct + (1 - prob_correct) * entropy_incorrect
        )
        item_eig[item] = current_entropy - expected_entropy

    # Take the items at the maximum EIG and break ties at random; otherwise
    # items that sort first would be deterministically favoured among equals.
    max_eig = max(item_eig.values())
    top_items = [item for item in item_eig if np.isclose(item_eig[item], max_eig)]
    item = random.choice(top_items)
    chosen = random.choice(remaining[item])
    return InstanceScore(instance_id=chosen, item=item, score=item_eig[item])