Commit e5a0db6

Add NER model variant with required fields
In order to have an NER model that is simpler for internal regex/CFG representations, add an NER variant that requires all fields and does not include a default value. In particular, this makes it possible to evaluate a version of NER for Outlines and provides an additional point of comparison for other libraries.
1 parent 2576774 commit e5a0db6
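
To illustrate what "requires all fields and does not include a default value" means at the schema level, here is a minimal sketch. The entity names PERSON/ORG are hypothetical, and the defaulted counterpart is an assumption for comparison, not the repository's existing ner_model: a Pydantic field declared without a default is listed under "required" in the generated JSON schema, which keeps the internal regex/CFG representation a constrained-decoding library has to build simpler.

    from typing import Optional

    from pydantic import Field, create_model

    entities = ["PERSON", "ORG"]  # hypothetical entity names, for illustration only

    # Required variant: no default is supplied, so every field must appear in the output.
    Required = create_model(
        "Required",
        **{name: (list[str], Field(description="")) for name in entities},
    )

    # Defaulted counterpart for comparison (an assumption, not the repo's ner_model).
    Defaulted = create_model(
        "Defaulted",
        **{name: (Optional[list[str]], Field(default=None, description="")) for name in entities},
    )

    print(Required.model_json_schema().get("required"))   # ['PERSON', 'ORG']
    print(Defaulted.model_json_schema().get("required"))  # None: nothing is required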

File tree

4 files changed: +45 −6 lines

config.yaml
data_sources/data_models.py
frameworks/base.py
main.py

config.yaml

Lines changed: 23 additions & 1 deletion
@@ -149,6 +149,18 @@ OutlinesFramework:
       retries: 0 # Oulines transformers has no retry parameter
       source_data_pickle_path: "data/multilabel_classification.pkl"
       # sample_rows: 2
+  - task: "ner_required_fields"
+    n_runs: 10
+    init_kwargs:
+      prompt: "Extract and resolve a list of entities from the following text: {text}"
+      # switch to prompt with JSON schema when supported:
+      #prompt: "Extract and resolve a list of entities from the following text: {text}.\nRespond in the following json schema: {json_schema}:\n"
+      llm_model: "unsloth/llama-3-8b-Instruct-bnb-4bit"
+      llm_model_family: "transformers"
+      retries: 0
+      source_data_pickle_path: "data/ner.pkl"
+      max_length: 4096
+      #sample_rows: 2
   - task: "synthetic_data_generation"
     n_runs: 100
     init_kwargs:
@@ -177,6 +189,16 @@ LMFormatEnforcerFramework:
       source_data_pickle_path: "data/ner.pkl"
       max_length: 4096
       # sample_rows: 2
+  - task: "ner_required_fields"
+    n_runs: 10
+    init_kwargs:
+      prompt: "Extract and resolve a list of entities from the following text: {text}.\nRespond in the following json schema: {json_schema}:\n"
+      llm_model: "unsloth/llama-3-8b-Instruct-bnb-4bit"
+      llm_model_family: "transformers"
+      retries: 0
+      source_data_pickle_path: "data/ner.pkl"
+      max_length: 4096
+      # sample_rows: 2
   - task: "synthetic_data_generation"
     n_runs: 100
     init_kwargs:
@@ -234,4 +256,4 @@ ModelsmithFramework:
       # llm_model_family: "transformers"
       # retries: 0 # Oulines transformers has no retry parameter
       # source_data_pickle_path: "data/ner.pkl"
-      # # sample_rows: 2
+      # # sample_rows: 2

data_sources/data_models.py

Lines changed: 9 additions & 1 deletion
@@ -2,7 +2,7 @@
 from enum import Enum
 from typing import Any, Optional, Type
 
-from pydantic import BaseModel, create_model, field_validator
+from pydantic import BaseModel, Field, create_model, field_validator
 from pydantic_core import PydanticUndefined
 
 
@@ -24,6 +24,14 @@ def ner_model(ner_entities):
     return NER
 
 
+def ner_required_fields_model(ner_entities):
+    fields = {name: (list[str], Field(description="")) for name in ner_entities}
+
+    NERRequiredFields = create_model("NERRequiredFields", **fields)
+
+    return NERRequiredFields
+
+
 def synthetic_data_generation_model():
     class UserAddress(BaseModel):
         street: str
frameworks/base.py

Lines changed: 11 additions & 2 deletions
@@ -12,6 +12,7 @@
 from data_sources.data_models import (
     multilabel_classification_model,
     ner_model,
+    ner_required_fields_model,
     synthetic_data_generation_model,
 )
 
@@ -82,7 +83,7 @@ def experiment(
 
     def experiment_decorator(func):
         def wrapper(*args, **kwargs):
-            allowed_tasks = ["multilabel_classification", "ner", "synthetic_data_generation"]
+            allowed_tasks = ["multilabel_classification", "ner", "ner_required_fields", "synthetic_data_generation"]
            if task not in allowed_tasks:
                raise ValueError(
                    f"{task} is not allowed. Allowed values are {allowed_tasks}"
@@ -119,7 +120,7 @@ def wrapper(*args, **kwargs):
                framework_metrics = {
                    "accuracy": accurate / num_successful if num_successful else 0
                }
-            elif task == "ner":
+            elif task in ("ner", "ner_required_fields"):
                framework_metrics = []
                for response in responses:
                    framework_metrics.append(calculate_metrics(expected_response, response))
@@ -191,6 +192,14 @@ def __init__(self, *args, **kwargs) -> None:
 
             self.response_model = ner_model(self.entities)
 
+        elif self.task == "ner_required_fields":
+            # Identify the entities
+            self.entities = list(
+                {key for d in self.source_data["labels"] for key in d.keys()}
+            )
+
+            self.response_model = ner_required_fields_model(self.entities)
+
         elif self.task == "synthetic_data_generation":
             self.response_model = synthetic_data_generation_model()
 
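The entity list passed to the factory is simply the set of label keys seen anywhere in the source data. A small standalone sketch with made-up labels shows the same comprehension as in the diff:

    # Made-up label dicts standing in for self.source_data["labels"].
    labels = [
        {"PERSON": ["Ada Lovelace"], "ORG": ["Analytical Engines Ltd"]},
        {"PERSON": ["Charles Babbage"], "DATE": ["1837"]},
    ]

    # Union of keys, deduplicated via a set; order is not guaranteed.
    entities = list({key for d in labels for key in d.keys()})
    print(sorted(entities))  # ['DATE', 'ORG', 'PERSON']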
main.py

Lines changed: 2 additions & 2 deletions
@@ -95,7 +95,7 @@ def generate_results(
     task: str = "multilabel_classification",
 ):
 
-    allowed_tasks = ["multilabel_classification", "ner", "synthetic_data_generation"]
+    allowed_tasks = ["multilabel_classification", "ner", "ner_required_fields", "synthetic_data_generation"]
     if task not in allowed_tasks:
         raise ValueError(f"{task} is not allowed. Allowed values are {allowed_tasks}")
 
@@ -127,7 +127,7 @@ def generate_results(
     logger.info(f"Latencies:\n{metrics.latency_metric(latencies, 95)}")
 
     # NER Micro Metrics
-    if task == "ner":
+    if task in ("ner", "ner_required_fields"):
         micro_metrics_df = metrics.ner_micro_metrics(results)
         logger.info(f"NER Micro Metrics:\n{micro_metrics_df}")
 