Используйте S3 в качестве источника для создания таблиц Iceberg.

Используйте S3 в качестве источника для создания таблиц Iceberg. ⇐ Apache

1 сообщение • Страница 1 из 1

Anonymous

Используйте S3 в качестве источника для создания таблиц Iceberg.

Цитата

Сообщение Anonymous » 19 ноя 2024, 04:07

В настоящее время я работаю над небольшой установкой, в которой у меня есть базовая настройка Iceberg, Trino, Hive Metastore и s3.
Я могу создавать таблицы Iceberg в s3 с помощью Trino CLI.
Теперь я хотел бы взять образец файла Parquet, хранящийся в s3, и создать таблицы Iceberg на основе его данных. Я не могу понять, какую конфигурацию или команду Trino мне нужно использовать, чтобы файл Parquet в s3 служил источником.
Любая помощь по этому поводу была бы полезна. Заранее спасибо!
my-trino.yaml
# Helm values for a Trino deployment (image tag 463) with two workers and
# two catalogs, `iceberg` and `hive`, that point at the same Hive Metastore.
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation — confirm it against the chart's values schema.
image:
  tag: "463"
  pullPolicy: IfNotPresent
server:
  workers: 2
  config:
    properties:
      # S3 configurations
      "fs.s3.aws.credentials.provider": "org.apache.hadoop.fs.s3a.WebIdentityTokenCredentialsProvider"
      # NOTE(review): the endpoint targets us-west-2 while the region below is
      # us-west-1 — confirm which one is intended.
      "fs.s3.endpoint": "s3.us-west-2.amazonaws.com"
      "fs.s3.region": "us-west-1"
      "fs.s3.use-instance-credentials": "false"
      "fs.s3.use-web-identity-token-credentials-provider": "true"
      "fs.s3.path-style-access": "true"
      # Iceberg configurations
      "iceberg.max-splits-per-scan": "1"
      # Discovery configurations
      "discovery.uri": "http://trino-coordinator:8080"
serviceAccount:
  # Reuse an existing service account instead of creating one.
  create: false
  name: trino-service-account
coordinator:
  service:
    type: LoadBalancer
    port: 8080
    name: trino-coordinator
  jvm:
    maxHeapSize: "3G"
  resources:
    limits:
      memory: "4Gi"
      cpu: "2"
    requests:
      memory: "2Gi"
      cpu: "1"
worker:
  service:
    type: ClusterIP
    name: trino-worker
  jvm:
    maxHeapSize: "3G"
    # NOTE(review): raw JVM flags are written here as mapping keys with empty
    # values; presumably the chart expects extra flags in a dedicated list
    # (e.g. `additionalJVMConfig`) — verify these are actually consumed.
    "-XX:+ExitOnOutOfMemoryError": ""
    "-XX:+HeapDumpOnOutOfMemoryError": ""
    "-XX:HeapDumpPath": "/tmp/dump.hprof"
  config:
    discovery.uri: "http://trino-coordinator:8080"
  resources:
    limits:
      memory: "4Gi"
      cpu: "2"
    requests:
      memory: "2Gi"
      cpu: "1"
catalogs:
  # Each value is the literal text of a catalog properties file; the body must
  # stay indented under the key to remain part of the `|-` block scalar.
  iceberg: |-
    connector.name=iceberg
    hive.metastore.uri=thrift://hivems-hive-metastore.arvind.svc.cluster.local:9083
    iceberg.catalog.type=hive_metastore
    s3.aws-access-key=******
    s3.aws-secret-key=******
    s3.path-style-access=true
    fs.native-s3.enabled=true
    iceberg.unique-table-location=true
  hive: |-
    connector.name=hive
    hive.metastore.uri=thrift://hivems-hive-metastore.arvind.svc.cluster.local:9083
    hive.non-managed-table-writes-enabled=true

my-values.yaml
# Helm values for the Hive Metastore backed by PostgreSQL, with the warehouse
# directory on S3 (s3a).
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation — confirm it against the chart's values schema.
#
# The base hadoop image to use for all components.
# See this repo for image build details: https://github.com/Comcast/kube-yarn/tree/master/image
postgresql:
  postgresqlUsername: hive
  postgresqlPassword: hive
  postgresqlDatabase: metastore

  # NOTE(review): presumably a postgresql sub-chart setting; verify placement.
  initdbScriptsConfigMap: hive-metastore-postgresql-init
image:
  repository: jboothomas/hive-metastore-s3
  tag: v6
  pullPolicy: IfNotPresent

resources: {}
conf:
  hiveSite:
    hive_metastore_uris: thrift://hivems-hive-metastore:9083
    # Masked placeholders are quoted: a bare value starting with `*` is read
    # by YAML as an alias and fails to parse.
    fs.s3a.access.key: "*******"
    fs.s3a.secret.key: "*******"
    hive.metastore.warehouse.dir: s3a://my-iceberg-trino-bucket/iceberg-warehouse
    fs.s3a.connection.ssl.enabled: false

Подробнее здесь: https://stackoverflow.com/questions/79201830/use-s3-as-the-source-to-create-iceberg-tables

1731978440

Anonymous

В настоящее время я работаю над небольшой установкой, в которой у меня есть базовая настройка Iceberg, Trino, Hive Metastore и s3.
Я могу создавать таблицы Iceberg в s3 с помощью Trino CLI.
Теперь я хотел бы взять образец файла Parquet, хранящийся в s3, и создать таблицы Iceberg на основе его данных. Я не могу понять, какую конфигурацию или команду Trino мне нужно использовать, чтобы файл Parquet в s3 служил источником.
Любая помощь по этому поводу была бы полезна. Заранее спасибо!
[b]my-trino.yaml[/b]
# Helm values for a Trino deployment (image tag 463) with two workers and
# two catalogs, `iceberg` and `hive`, that point at the same Hive Metastore.
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation — confirm it against the chart's values schema.
image:
  tag: "463"
  pullPolicy: IfNotPresent
server:
  workers: 2
  config:
    properties:
      # S3 configurations
      "fs.s3.aws.credentials.provider": "org.apache.hadoop.fs.s3a.WebIdentityTokenCredentialsProvider"
      # NOTE(review): the endpoint targets us-west-2 while the region below is
      # us-west-1 — confirm which one is intended.
      "fs.s3.endpoint": "s3.us-west-2.amazonaws.com"
      "fs.s3.region": "us-west-1"
      "fs.s3.use-instance-credentials": "false"
      "fs.s3.use-web-identity-token-credentials-provider": "true"
      "fs.s3.path-style-access": "true"
      # Iceberg configurations
      "iceberg.max-splits-per-scan": "1"
      # Discovery configurations
      "discovery.uri": "http://trino-coordinator:8080"
serviceAccount:
  # Reuse an existing service account instead of creating one.
  create: false
  name: trino-service-account
coordinator:
  service:
    type: LoadBalancer
    port: 8080
    name: trino-coordinator
  jvm:
    maxHeapSize: "3G"
  resources:
    limits:
      memory: "4Gi"
      cpu: "2"
    requests:
      memory: "2Gi"
      cpu: "1"
worker:
  service:
    type: ClusterIP
    name: trino-worker
  jvm:
    maxHeapSize: "3G"
    # NOTE(review): raw JVM flags are written here as mapping keys with empty
    # values; presumably the chart expects extra flags in a dedicated list
    # (e.g. `additionalJVMConfig`) — verify these are actually consumed.
    "-XX:+ExitOnOutOfMemoryError": ""
    "-XX:+HeapDumpOnOutOfMemoryError": ""
    "-XX:HeapDumpPath": "/tmp/dump.hprof"
  config:
    discovery.uri: "http://trino-coordinator:8080"
  resources:
    limits:
      memory: "4Gi"
      cpu: "2"
    requests:
      memory: "2Gi"
      cpu: "1"
catalogs:
  # Each value is the literal text of a catalog properties file; the body must
  # stay indented under the key to remain part of the `|-` block scalar.
  iceberg: |-
    connector.name=iceberg
    hive.metastore.uri=thrift://hivems-hive-metastore.arvind.svc.cluster.local:9083
    iceberg.catalog.type=hive_metastore
    s3.aws-access-key=******
    s3.aws-secret-key=******
    s3.path-style-access=true
    fs.native-s3.enabled=true
    iceberg.unique-table-location=true
  hive: |-
    connector.name=hive
    hive.metastore.uri=thrift://hivems-hive-metastore.arvind.svc.cluster.local:9083
    hive.non-managed-table-writes-enabled=true

[b]my-values.yaml[/b]
# Helm values for the Hive Metastore backed by PostgreSQL, with the warehouse
# directory on S3 (s3a).
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation — confirm it against the chart's values schema.
#
# The base hadoop image to use for all components.
# See this repo for image build details: https://github.com/Comcast/kube-yarn/tree/master/image
postgresql:
  postgresqlUsername: hive
  postgresqlPassword: hive
  postgresqlDatabase: metastore

  # NOTE(review): presumably a postgresql sub-chart setting; verify placement.
  initdbScriptsConfigMap: hive-metastore-postgresql-init
image:
  repository: jboothomas/hive-metastore-s3
  tag: v6
  pullPolicy: IfNotPresent

resources: {}
conf:
  hiveSite:
    hive_metastore_uris: thrift://hivems-hive-metastore:9083
    # Masked placeholders are quoted: a bare value starting with `*` is read
    # by YAML as an alias and fails to parse.
    fs.s3a.access.key: "*******"
    fs.s3a.secret.key: "*******"
    hive.metastore.warehouse.dir: s3a://my-iceberg-trino-bucket/iceberg-warehouse
    fs.s3a.connection.ssl.enabled: false
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79201830/use-s3-as-the-source-to-create-iceberg-tables[/url]