Skip to content
Snippets Groups Projects
Commit 837418a8 authored by Marcus Johansson's avatar Marcus Johansson
Browse files

Issue #3465221 by Marcus_Johansson: Add Embeddings type and figure out Title...

parent 5e011297
No related branches found
No related tags found
1 merge request!36Issue #3465221 by Marcus_Johansson: Add Embeddings type and figure out Title...
Pipeline #240618 passed with warnings
......@@ -6,6 +6,7 @@ use Drupal\ai\AiProviderPluginManager;
use Drupal\ai\Plugin\ProviderProxy;
use Drupal\ai_search\EmbeddingStrategyInterface;
use Drupal\ai\Utility\TextChunker;
use Drupal\Core\Entity\EntityTypeManager;
use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
use Drupal\Core\StringTranslation\StringTranslationTrait;
use League\HTMLToMarkdown\Converter\TableConverter;
......@@ -75,6 +76,7 @@ abstract class EmbeddingStrategyPluginBase
protected AiProviderPluginManager $aiProviderManager,
protected HtmlConverter $converter,
protected TextChunker $textChunker,
protected EntityTypeManager $entityTypeManager,
) {
// Set the default converter settings.
$this->converter->getConfig()->setOption('strip_tags', true);
......@@ -125,6 +127,7 @@ abstract class EmbeddingStrategyPluginBase
$ai_provider,
new HtmlConverter(),
$text_chunker,
$container->get('entity_type.manager'),
);
}
......
......@@ -89,22 +89,30 @@ implements EmbeddingStrategyInterface {
$metadata = '';
$main_fields = '';
foreach ($fields as $field) {
// The fields originally come from the Search API
// ItemInterface::getFields() method. Ensure that is still the case.
if (!$field instanceof FieldInterface) {
continue;
}
// Get the label field.
$entity = $field->getDatasource();
$entity_type = $this->entityTypeManager->getDefinition($entity->getEntityTypeId());
$label_key = $entity_type->getKey('label');
$value = $this->compositeValues($field);
// TODO: Put title field name into configuration, do not assume 'title'.
if (($field->getLabel() == 'title') && (strlen($value) < 255)) {
// The title field.
if ($field->getFieldIdentifier() == $label_key) {
$title = $value;
} elseif (strlen($value) < 255) {
$metadata .= $field->getLabel() . ": " . $value . "\n\n";
} else {
}
// The embeddings fields.
elseif ($field->getType() == 'embeddings') {
$main_fields .= $value . "\n\n";
}
// Everything else is metadata.
else {
$metadata .= $field->getLabel() . ": " . $value . "\n\n";
}
}
return [
$title,
......@@ -137,7 +145,7 @@ implements EmbeddingStrategyInterface {
if (strlen($title . $main_fields . $metadata) <= $this->chunkSize) {
// Ideal situation, everything fits in a single embedding.
$chunks = $this->textChunker->chunkText(
"# " . strtoupper($title) . "\n\n" . $main_fields . "\n\n" . $metadata,
$this->prepareChunkText($title, $main_fields, $metadata),
$this->chunkSize,
$this->chunkMinOverlap
);
......@@ -151,7 +159,7 @@ implements EmbeddingStrategyInterface {
$this->chunkMinOverlap
);
foreach ($main_chunks as $main_chunk) {
$chunks[] = '# ' . strtoupper($title) . "\n\n" . $main_chunk . "\n\n" . $metadata;
$chunks[] = $this->prepareChunkText($title, $main_chunk, $metadata);
}
} else {
// Both metadata and main fields need chunking.
......@@ -170,7 +178,7 @@ implements EmbeddingStrategyInterface {
);
foreach ($main_chunks as $main_chunk) {
foreach ($metadata_chunks as $metadata_chunk) {
$chunks[] = '# ' . strtoupper($title) . "\n\n" . $main_chunk . "\n\n" . $metadata_chunk;
$chunks[] = $this->prepareChunkText($title, $main_chunk, $metadata_chunk);
}
}
}
......@@ -178,6 +186,32 @@ implements EmbeddingStrategyInterface {
return $chunks;
}
/**
 * Assembles a single chunk from a title, main content and metadata.
 *
 * The parts are joined with one blank line between them. A non-empty title
 * is rendered as an upper-cased heading line ("# TITLE"). An empty title or
 * empty metadata is left out entirely so that no stray separators are
 * produced. The main chunk is always included, even when it is empty.
 *
 * @param string $title
 *   The title content.
 * @param string $main_chunk
 *   The main field content.
 * @param string $metadata_chunk
 *   The metadata related content.
 *
 * @return string
 *   The rendered chunk.
 */
protected function prepareChunkText(string $title, string $main_chunk, string $metadata_chunk): string {
  $parts = [];
  // Compare against '' instead of using empty(): empty('0') is TRUE in PHP,
  // which would silently drop a title or metadata value of "0".
  if ($title !== '') {
    $parts[] = '# ' . strtoupper($title);
  }
  $parts[] = $main_chunk;
  if ($metadata_chunk !== '') {
    $parts[] = $metadata_chunk;
  }
  return implode("\n\n", $parts);
}
/**
* @inheritDoc
*/
......
......@@ -275,6 +275,16 @@ class SearchApiAiSearchBackend extends AiSearchBackendPluginBase implements Plug
}
}
/**
 * {@inheritdoc}
 */
public function supportsDataType($type) {
  // Only the "embeddings" data type is reported as supported here; the
  // strict comparison means any non-string value yields FALSE.
  return $type === 'embeddings';
}
/**
* Get the chat model options that the tokenizer supports.
*
......
<?php
namespace Drupal\ai_search\Plugin\search_api\data_type;
use Drupal\search_api\DataType\DataTypePluginBase;
/**
 * Provides the embeddings data type.
 *
 * The class adds no methods of its own; all behavior is inherited from
 * DataTypePluginBase. The plugin ID "embeddings" declared in the annotation
 * below is the same string that the AI Search backend's supportsDataType()
 * accepts, and that the embedding strategy compares a field's type against
 * when picking the main content fields. The annotation is read as plugin
 * metadata, so keep its keys and values intact when editing this comment.
 *
 * @SearchApiDataType(
 *   id = "embeddings",
 *   label = @Translation("Embeddings"),
 *   description = @Translation("LLM Vector Embeddings")
 * )
 */
class Embeddings extends DataTypePluginBase {
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment