При использовании Custom Extractor представление схемы Python API не обеспечивает доступ к EntityTypes; так и должно быть?

При использовании Custom Extractor представление схемы Python API не обеспечивает доступ к EntityTypes; так и должно быть? ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

При использовании Custom Extractor представление схемы Python API не обеспечивает доступ к EntityTypes; так и должно быть?

Цитата

Сообщение Anonymous » 01 ноя 2025, 14:36

Согласно документации API, DocumentSchema имеет дочерние элементы EntityType, которые должны содержать сведения обо всех полях пользовательского экстрактора. Мне удаётся получить DocumentSchema, как и ожидалось. Однако массив EntityType оказывается пустым, хотя должен содержать все поля.
Вы можете просмотреть все поля через пользовательский интерфейс консоли:

Вот код, демонстрирующий проблему:

Код: Выделить всё

import os
from google.cloud import documentai_v1 as documentai
from google.api_core.client_options import ClientOptions

# Point the Google Cloud credential lookup at a local credentials JSON file.
# This is set here, ahead of the client construction further down the script.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './google-credentials.json'

def print_schema_fields(schema):
    """Print the fields of the custom-extraction entity type in ``schema``.

    Walks ``schema.entity_types`` and, for every entity type whose name is
    ``custom_extraction_document_type``, prints its name, its base types and
    the name of each entry in its ``properties``. Nothing is returned.

    Args:
        schema: Object exposing an iterable ``entity_types`` attribute
            (presumably a Document AI ``DocumentSchema`` -- confirm against
            the caller).

    NOTE(review): the nesting was reconstructed from a flattened paste; the
    ``hasattr`` check is assumed to belong inside the name check, which is
    consistent with the sample output shown alongside the script.
    """
    print("\nSchema Fields:")
    for entity_type in schema.entity_types:
        # Skip every entity type except the one this script looks for.
        if entity_type.name != "custom_extraction_document_type":
            continue

        print(f"Entity Type Name: {entity_type.name}")
        print(f"Base Types: {entity_type.base_types}")

        if not hasattr(entity_type, 'properties'):
            print("No properties attribute found")
            continue

        print(f"Properties found: {len(entity_type.properties)}")
        # Named `prop` rather than `property` so the builtin is not shadowed.
        for prop in entity_type.properties:
            print(f"Property: {prop.name}")

def get_processor_schema(client, processor_name):
    """Print the document schema attached to a processor's first listed version.

    Fetches the processor, lists its versions, takes the first one returned
    and prints that version's display name followed by its schema (via
    ``print_schema_fields``) or a "No schema found" message. If the processor
    has no versions, nothing is printed. Nothing is returned.

    Args:
        client: Object providing ``get_processor(name=...)`` and
            ``list_processor_versions(parent=...)``; the script passes a
            ``DocumentProcessorServiceClient``.
        processor_name: Resource name handed to ``client.get_processor``.
    """
    processor = client.get_processor(name=processor_name)
    versions = client.list_processor_versions(parent=processor.name)

    # NOTE(review): despite the variable name, this is simply the first
    # version the listing yields -- confirm it is the version whose schema
    # is actually wanted (e.g. a custom-trained one rather than a default).
    latest_version = next(iter(versions), None)
    if not latest_version:
        return

    print(f"Processor Version: {latest_version.display_name}")
    schema = getattr(latest_version, 'document_schema', None)

    if schema:
        print(f"Schema Name: {schema.display_name}")
        print_schema_fields(schema)
    else:
        print("No schema found")

# Setup
project_id = 'api-pr....25020'
location = 'us'

# The API endpoint is derived from the chosen location.
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)

# Get custom extractors
parent = f"projects/{project_id}/locations/{location}"
processors = client.list_processors(parent=parent)
# Keep only processors whose type string marks them as custom extractors.
custom_extractors = [p for p in processors if p.type_ == "CUSTOM_EXTRACTION_PROCESSOR"]

# Report the schema of each custom extractor found in the project/location.
for extractor in custom_extractors:
    print(f"\nProcessor: {extractor.display_name}")
    get_processor_schema(client, extractor.name)

Вот выходные данные, показывающие успешное получение DocumentSchema и пустые типы сущностей:

Код: Выделить всё

Processor: TaxProcessor
Processor Version: Google Stable
Schema Name: CDE Schema

Schema Fields:
Entity Type Name: custom_extraction_document_type
Base Types: ['document']
Properties found: 0

Я неправильно использую API? Как я могу прочитать настроенные поля для пользовательского экстрактора (и могу ли я их создать/обновить?)

Подробнее здесь: https://stackoverflow.com/questions/791 ... ss-to-enti

1761997011

Anonymous

Согласно документации API, DocumentSchema имеет дочерние элементы EntityType, которые должны содержать сведения обо всех полях пользовательского экстрактора. Мне удаётся получить DocumentSchema, как и ожидалось. Однако массив EntityType оказывается пустым, хотя должен содержать все поля.
Вы можете просмотреть все поля через пользовательский интерфейс консоли:
[img]https://i.sstatic.net/51L9IBgH.png[/img]

Вот код, демонстрирующий проблему:
[code]import os
from google.cloud import documentai_v1 as documentai
from google.api_core.client_options import ClientOptions

# Point the Google Cloud credential lookup at a local credentials JSON file.
# This is set here, ahead of the client construction further down the script.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './google-credentials.json'

def print_schema_fields(schema):
    """Print the fields of the custom-extraction entity type in ``schema``.

    Walks ``schema.entity_types`` and, for every entity type whose name is
    ``custom_extraction_document_type``, prints its name, its base types and
    the name of each entry in its ``properties``. Nothing is returned.

    Args:
        schema: Object exposing an iterable ``entity_types`` attribute
            (presumably a Document AI ``DocumentSchema`` -- confirm against
            the caller).

    NOTE(review): the nesting was reconstructed from a flattened paste; the
    ``hasattr`` check is assumed to belong inside the name check, which is
    consistent with the sample output shown alongside the script.
    """
    print("\nSchema Fields:")
    for entity_type in schema.entity_types:
        # Skip every entity type except the one this script looks for.
        if entity_type.name != "custom_extraction_document_type":
            continue

        print(f"Entity Type Name: {entity_type.name}")
        print(f"Base Types: {entity_type.base_types}")

        if not hasattr(entity_type, 'properties'):
            print("No properties attribute found")
            continue

        print(f"Properties found: {len(entity_type.properties)}")
        # Named `prop` rather than `property` so the builtin is not shadowed.
        for prop in entity_type.properties:
            print(f"Property: {prop.name}")

def get_processor_schema(client, processor_name):
    """Print the document schema attached to a processor's first listed version.

    Fetches the processor, lists its versions, takes the first one returned
    and prints that version's display name followed by its schema (via
    ``print_schema_fields``) or a "No schema found" message. If the processor
    has no versions, nothing is printed. Nothing is returned.

    Args:
        client: Object providing ``get_processor(name=...)`` and
            ``list_processor_versions(parent=...)``; the script passes a
            ``DocumentProcessorServiceClient``.
        processor_name: Resource name handed to ``client.get_processor``.
    """
    processor = client.get_processor(name=processor_name)
    versions = client.list_processor_versions(parent=processor.name)

    # NOTE(review): despite the variable name, this is simply the first
    # version the listing yields -- confirm it is the version whose schema
    # is actually wanted (e.g. a custom-trained one rather than a default).
    latest_version = next(iter(versions), None)
    if not latest_version:
        return

    print(f"Processor Version: {latest_version.display_name}")
    schema = getattr(latest_version, 'document_schema', None)

    if schema:
        print(f"Schema Name: {schema.display_name}")
        print_schema_fields(schema)
    else:
        print("No schema found")

# Setup
project_id = 'api-pr....25020'
location = 'us'

# The API endpoint is derived from the chosen location.
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)

# Get custom extractors
parent = f"projects/{project_id}/locations/{location}"
processors = client.list_processors(parent=parent)
# Keep only processors whose type string marks them as custom extractors.
custom_extractors = [p for p in processors if p.type_ == "CUSTOM_EXTRACTION_PROCESSOR"]

# Report the schema of each custom extractor found in the project/location.
for extractor in custom_extractors:
    print(f"\nProcessor: {extractor.display_name}")
    get_processor_schema(client, extractor.name)
[/code]
Вот выходные данные, показывающие успешное получение DocumentSchema и пустые типы сущностей:
[code]
Processor: TaxProcessor
Processor Version: Google Stable
Schema Name: CDE Schema

Schema Fields:
Entity Type Name: custom_extraction_document_type
Base Types: ['document']
Properties found: 0
[/code]
Я неправильно использую API?  Как я могу прочитать настроенные поля для пользовательского экстрактора (и могу ли я их создать/обновить?) 

Подробнее здесь: [url]https://stackoverflow.com/questions/79196878/with-custom-extractor-python-api-view-of-schema-does-not-provide-access-to-enti[/url]