Однако мне сложно автоматически импортировать большие объемы данных в свое хранилище в виде файлов Markdown. В частности, я хочу импортировать книги из списков Goodreads (например: «Самые тревожные книги из когда-либо написанных»), включая:
- Название
- Автор
- Обложку (coverUrl)
- Средний рейтинг
- Жанр
Я пробовал использовать requests + BeautifulSoup, но теперь Goodreads динамически загружает содержимое списка с помощью JavaScript, поэтому мой скрипт не находит ни одной книги. Я перешёл на Selenium, который справляется с парсингом страницы, но мне интересно, есть ли более эффективный, «питоничный» (Pythonic) или масштабируемый способ сделать это, особенно если позже я захочу импортировать несколько списков, сотни книг или другие типы медиа (фильмы, мангу и т. д.).
Я был бы очень признателен за любые рекомендации, примеры или библиотеки, которые могли бы мне помочь:
- Надёжно парсить списки Goodreads, даже если их содержимое загружается через JavaScript.
- Экспортировать каждый элемент в Markdown с YAML-frontmatter для Obsidian.
- Эффективно обрабатывать большие коллекции (избегать дублирования, использовать контрольные точки).
dataviewjs
/* === CONFIGURAÇÃO === */
// Vault folders scanned for gallery notes. Every entry is wrapped in double
// quotes because the joined string is passed to dv.pages() as the source query.
const folders = [
  "3- Bem estar/Hobbies e Inspirações/Coleções",
  "2- Maestria/Conhecimento",
  "1- Acadêmico",
].map((folderPath) => `"${folderPath}"`);
const query = folders.join(" or ");

// Accent colour per normalized type name; "Outros" is the fallback entry.
const cores = {
  // Reading
  "Livros": "#1D4ED8",
  "Mangás": "#F43F5E",
  "HQs": "#F97316",
  "Webtoons": "#06B6D4",
  "Manhwa": "#8B5CF6",
  // Screen and audio
  "Filmes": "#DC2626",
  "Séries": "#2563EB",
  "Músicas": "#EAB308",
  // Art
  "Arte": "#10B981",
  "Esculturas": "#78716C",
  "Obras de Arte": "#A855F7",
  // Beings
  "Animals": "#84CC16",
  "Creatures": "#9333EA",
  // Games and reference
  "Jogos de Tabuleiro": "#B45309",
  "Video Games": "#0EA5E9",
  "Wiki": "#15803D",
  "Jogos": "#3B82F6",
  "Documentos": "#64748B",
  "Outros": "#9CA3AF",
  // Knowledge areas and notes
  "Área Acadêmica": "#0D9488",
  "Área de Conhecimento": "#1E40AF",
  "Área Profissional": "#7C3AED",
  "Anotação": "#92487A",
  "Reflexão": "#6B7280",
  "Citação": "#9CA3AF",
  "Experimento Científico": "#16A34A",
};

// Pagination state shared by render(), applyFilters() and the button handlers.
const PAGE_SIZE = 21;
let currentPage = 1;
let filteredItems = [];
/* === FUNÇÕES BASE === */
/**
 * Coerces a frontmatter value into an array.
 *
 * @param {*} value - A single value, an array, or nothing.
 * @returns {Array} The same array when `value` is already one, a one-element
 *   array for any other truthy value, and an empty array for falsy values.
 */
function normalizeList(value) {
  if (Array.isArray(value)) {
    return value;
  }
  return value ? [value] : [];
}
/**
 * Formats a single value or a list of values as comma-separated text.
 *
 * @param {*} value - Anything accepted by normalizeList().
 * @returns {string} The entries joined with ", " (empty string when none).
 */
function joinList(value) {
  const entries = normalizeList(value);
  return entries.join(", ");
}
// Lower-cased alias -> canonical type name. Built once instead of on every
// call, and stored in a Map so inherited Object.prototype members such as
// "constructor" or "__proto__" can never be returned as a type.
const TYPE_ALIASES = new Map(
  Object.entries({
    "Livros": ["livro", "livros", "book", "books"],
    "Mangás": ["manga", "mangá", "mangas", "mangás"],
    "HQs": ["hq", "hqs", "comic", "comics"],
    "Webtoons": ["webtoon", "webtoons"],
    "Manhwa": ["manhwa", "manhwas"],
    "Filmes": ["filme", "filmes", "movie", "movies"],
    "Séries": ["série", "séries", "series"],
    "Músicas": ["musicrelease", "música", "músicas", "musica", "music"],
    "Arte": ["arte", "art"],
    "Esculturas": ["escultura", "esculturas", "sculpture"],
    "Obras de Arte": ["obra de arte", "obras de arte", "artwork"],
    "Animals": ["ser", "animal", "animals"],
    "Creatures": ["criatura", "criaturas", "creature", "creatures", "entity"],
    "Jogos": ["jogo", "jogos", "game", "games"],
    "Jogos de Tabuleiro": ["jogo de tabuleiro", "jogos de tabuleiro", "board game"],
    "Video Games": ["videogame", "video game", "video games"],
    "Wiki": ["wiki", "wikipedia"],
    "Documentos": ["documento", "documentos", "document"],
    "Área Acadêmica": ["acadêmico", "academico", "área acadêmica", "area academica"],
    "Área de Conhecimento": ["conhecimento", "área de conhecimento"],
    "Área Profissional": ["profissional", "área profissional"],
    "Anotação": ["anotação", "note"],
    "Reflexão": ["reflexão"],
    "Citação": ["citação", "quote"],
    "Experimento Científico": ["experimento científico", "experimento"],
  }).flatMap(([canonical, aliases]) => aliases.map((alias) => [alias, canonical]))
);

/**
 * Maps a free-form `type` frontmatter value onto one canonical category name.
 *
 * Matching ignores case and surrounding whitespace. Canonical names are
 * accepted as aliases of themselves, so an already normalized value
 * (e.g. "Filmes") stays unchanged instead of falling back to "Outros".
 *
 * @param {*} typeRaw - Raw value read from the note; coerced with String().
 * @returns {string} The canonical category, or "Outros" when the value is
 *   empty or unknown.
 */
function normalizarTipo(typeRaw) {
  if (!typeRaw) return "Outros";
  const key = String(typeRaw).toLowerCase().trim();
  return TYPE_ALIASES.get(key) ?? "Outros";
}
/**
 * Capitalizes the first character of a `subType` frontmatter value.
 *
 * The value is coerced with String() first, so numbers or lists coming from
 * YAML no longer throw on `.charAt`, and surrounding whitespace is trimmed
 * to match how normalizarTipo() treats its input.
 *
 * @param {*} subTypeRaw - Raw value read from the note.
 * @returns {string} The trimmed text with an upper-case first character, or
 *   "" when the value is falsy or blank.
 */
function normalizarSubTipo(subTypeRaw) {
  if (!subTypeRaw) return "";
  const text = String(subTypeRaw).trim();
  if (text === "") return "";
  return text.charAt(0).toUpperCase() + text.slice(1);
}
/**
 * Renders a 0-10 score as five star glyphs (full, half, empty).
 *
 * @param {*} nota - Score; converted with Number() and clamped to [0, 10].
 * @returns {string} Five glyphs, or "" when `nota` is null/undefined. A value
 *   that converts to NaN also yields "" because String.prototype.repeat
 *   treats a NaN count as zero.
 */
function renderRating(nota) {
  if (nota == null) {
    return "";
  }
  const clamped = Math.min(Math.max(Number(nota), 0), 10);
  const stars = clamped / 2;
  const fullCount = Math.floor(stars);
  const halfCount = stars - fullCount >= 0.5 ? 1 : 0;
  const glyphs = [
    "★".repeat(fullCount),
    "⯪".repeat(halfCount),
    "☆".repeat(5 - fullCount - halfCount),
  ];
  return glyphs.join("");
}
/* === CARREGAMENTO === */
// Build one gallery record per page returned by the Dataview source query.
// Each record keeps every original page field (spread first) and then
// overrides or adds the normalized fields the gallery reads.
let items = [];
for (const page of dv.pages(query)) {
  // People credited on the note, whichever field name (EN or PT) it uses.
  const writers = normalizeList(page.author ?? page.autor ?? page.writer);
  const directors = normalizeList(page.director ?? page.diretor);
  const artists = normalizeList(page.artist ?? page.artista);
  const credited = [...writers, ...directors, ...artists].filter(Boolean);

  const record = {
    ...page,
    cover: page.cover ?? page.coverUrl ?? page.poster ?? "",
    title: page.title ?? page.portugueseTitle ?? page.englishTitle ?? page.file.name,
    portugueseTitle: page.portugueseTitle ?? "",
    englishTitle: page.englishTitle ?? "",
    onlineRating: page.onlineRating ?? null,
    creators: joinList(credited),
    type: normalizarTipo(page.type ?? ""),
    subType: normalizarSubTipo(page.subType ?? ""),
    status: String(page.status ?? "").trim(),
    series: page.series ?? "",
    rating: page.rating ?? null,
    created: page.file.ctime,
  };
  items.push(record);
}
/* === UI CONTROLS === */
// Static skeleton of the gallery: filter/sort controls, the card grid and the
// pager. The element ids are the ones the rest of the script looks up with
// getElementById; the class names are the ones styled by the injected CSS.
// NOTE(review): the original markup was lost (only the labels survived), so
// the tags, the "desc" option value and the "Página anterior" label are
// reconstructed - confirm against the original note.
dv.container.innerHTML = `
<div class="dv-gallery-controls">
  <select id="dv-g-type"><option value="">Todos os tipos</option></select>
  <select id="dv-g-subtype"><option value="">Todos os subtipos</option></select>
  <select id="dv-g-status"><option value="">Todos os status</option></select>
  <select id="dv-g-image">
    <option value="">Todas</option>
    <option value="with">Com Imagem</option>
    <option value="without">Sem Imagem</option>
  </select>
  <select id="dv-g-order">
    <option value="title">Título</option>
    <option value="created">Data de criação</option>
    <option value="fileName">Nome do arquivo</option>
  </select>
  <select id="dv-g-direction">
    <option value="asc">Crescente ↑</option>
    <option value="desc">Decrescente ↓</option>
  </select>
  <button id="dv-g-reset">Resetar</button>
</div>
<div id="dv-gallery-grid" class="dv-grid"></div>
<div class="dv-gallery-pages">
  <button id="dv-g-prev">Página anterior</button>
  <span id="dv-g-page">1</span>
  <button id="dv-g-next">Próxima página</button>
</div>
`;
/* === POPULA FILTROS === */
/**
 * Fills the type, subtype and status <select> elements with the distinct
 * values found in `items`.
 *
 * Options are created with DOM APIs and `textContent` rather than HTML
 * strings, so a value containing markup is shown literally and every entry
 * is a real <option> whose `value` the filter logic can compare against.
 * A select that is not in the document is skipped.
 */
function populateFilters() {
  const fillSelect = (selectId, values) => {
    const select = document.getElementById(selectId);
    if (!select) return;
    for (const value of new Set(values)) {
      const option = document.createElement("option");
      option.value = value;
      option.textContent = value;
      select.appendChild(option);
    }
  };

  fillSelect("dv-g-type", items.map((item) => item.type));
  // Empty subtypes/statuses are left out: the first, empty option means "all".
  fillSelect("dv-g-subtype", items.map((item) => item.subType).filter(Boolean));
  fillSelect("dv-g-status", items.map((item) => item.status).filter(Boolean));
}
/* === RENDER GRID === */
/**
 * Renders the current page of `list` as cards inside #dv-gallery-grid and
 * updates the page indicator (#dv-g-page).
 *
 * Cards are built with DOM APIs and `textContent`, so titles or names coming
 * from note frontmatter are never interpreted as HTML.
 * NOTE(review): the original card markup, the fallback cover URL and the
 * exact online-rating format were lost; the structure below is reconstructed
 * from the class names the injected CSS styles - confirm against the
 * original note.
 *
 * @param {Array<object>} list - Records to paginate; reads `type`, `subType`,
 *   `status`, `title`, `creators`, `rating`, `onlineRating`, `cover` and
 *   `file.path` from each.
 */
function render(list) {
  const grid = document.getElementById("dv-gallery-grid");
  grid.innerHTML = "";

  const start = (currentPage - 1) * PAGE_SIZE;
  const pageItems = list.slice(start, start + PAGE_SIZE);

  // Creates an element with an optional class and literal text content.
  const el = (tag, className, text) => {
    const node = document.createElement(tag);
    if (className) node.className = className;
    if (text !== undefined) node.textContent = String(text);
    return node;
  };

  // "#RRGGBB" -> translucent rgba() used as the card background.
  const tint = (hex) => {
    const channels = [1, 3, 5].map((i) => Number.parseInt(hex.slice(i, i + 2), 16));
    return `rgba(${channels.join(",")},0.15)`;
  };

  for (const p of pageItems) {
    const typeColor = cores[p.type] ?? cores["Outros"];

    const card = el("div", "dv-card");
    card.style.border = `2px solid ${typeColor}`;
    card.style.background = p.subType ? tint(typeColor) : "var(--background-secondary)";
    card.title = [
      `Tipo: ${p.type}`,
      `Subtipo: ${p.subType || "—"}`,
      `Status: ${p.status || "—"}`,
    ].join("\n");

    // Cover area; without a cover the box stays empty but keeps its 2:3
    // ratio through the .dv-card-media padding.
    const media = el("div", "dv-card-media");
    if (p.cover) {
      const img = el("img");
      img.src = String(p.cover);
      img.alt = String(p.title ?? "");
      img.loading = "lazy";
      media.appendChild(img);
    }

    const badges = el("div", "dv-badges");
    const typeBadge = el("span", "dv-card-badge", p.type);
    typeBadge.style.background = typeColor;
    typeBadge.style.color = "#fff";
    badges.appendChild(typeBadge);
    if (p.subType) {
      const subBadge = el("span", "dv-card-subtype", p.subType);
      subBadge.style.background = "var(--background-primary)";
      badges.appendChild(subBadge);
    }
    media.appendChild(badges);

    const body = el("div", "dv-card-body");
    body.appendChild(el("div", "dv-card-title", p.title ?? ""));
    body.appendChild(el("div", "dv-card-sub", p.creators ?? ""));
    const onlineSuffix = p.onlineRating ? ` (${p.onlineRating})` : "";
    body.appendChild(el("div", "dv-card-rating", renderRating(p.rating) + onlineSuffix));
    if (p.status) {
      body.appendChild(el("div", "dv-card-status", p.status));
    }

    card.appendChild(media);
    card.appendChild(body);
    card.onclick = () => app.workspace.openLinkText(p.file.path, '/', false);
    grid.appendChild(card);
  }

  document.getElementById("dv-g-page").textContent = String(currentPage);
}
/* === FILTROS E NAVEGAÇÃO === */
/**
 * Re-filters and re-sorts `items` from the current control values, resets the
 * pager to page 1 and re-renders the grid.
 *
 * Text keys (title, file name) are compared with a locale-aware collator that
 * ignores case and accents and orders digit runs numerically, so "Água" sorts
 * next to "Agua" instead of after "Z" and "Vol 2" comes before "Vol 10".
 * Other keys (the creation time) use the relational operators.
 * `items` itself is never reordered: filter() returns a fresh array.
 */
function applyFilters() {
  const valueOf = (id) => document.getElementById(id).value;
  const typ = valueOf("dv-g-type");
  const subtyp = valueOf("dv-g-subtype");
  const stat = valueOf("dv-g-status");
  const imgFilter = valueOf("dv-g-image");
  const order = valueOf("dv-g-order");
  const dir = valueOf("dv-g-direction");

  filteredItems = items.filter((p) =>
    (!typ || p.type === typ) &&
    (!subtyp || p.subType === subtyp) &&
    (!stat || p.status === stat) &&
    (!imgFilter ||
      (imgFilter === "with" && Boolean(p.cover)) ||
      (imgFilter === "without" && !p.cover))
  );

  // Any direction other than "asc" sorts descending.
  const sign = dir === "asc" ? 1 : -1;
  const collator = new Intl.Collator(undefined, { sensitivity: "base", numeric: true });
  const keyOf =
    order === "created" ? (p) => p.created
    : order === "fileName" ? (p) => p.file.name
    : (p) => p.title;

  filteredItems.sort((a, b) => {
    const left = keyOf(a);
    const right = keyOf(b);
    if (typeof left === "string" && typeof right === "string") {
      return sign * collator.compare(left, right);
    }
    if (left < right) return -sign;
    if (left > right) return sign;
    return 0;
  });

  currentPage = 1;
  render(filteredItems);
}
/* === EVENTOS === */
// Wire the controls once the skeleton is in the document, then draw page 1.
// The 100 ms delay is kept from the original (presumably to make sure the
// container markup is attached before it is queried - TODO confirm).
setTimeout(() => {
  populateFilters();

  const filterIds = [
    "dv-g-type", "dv-g-subtype", "dv-g-status",
    "dv-g-image", "dv-g-order", "dv-g-direction",
  ];
  for (const id of filterIds) {
    document.getElementById(id)?.addEventListener("change", applyFilters);
  }

  // Reset: clear every filter, restore the default sort and re-apply.
  document.getElementById("dv-g-reset")?.addEventListener("click", () => {
    for (const id of ["dv-g-type", "dv-g-subtype", "dv-g-status", "dv-g-image"]) {
      document.getElementById(id).value = "";
    }
    document.getElementById("dv-g-order").value = "title";
    document.getElementById("dv-g-direction").value = "asc";
    applyFilters();
  });

  document.getElementById("dv-g-prev")?.addEventListener("click", () => {
    if (currentPage > 1) {
      currentPage--;
      render(filteredItems);
      dv.container.scrollTop = 0;
    }
  });

  // NOTE(review): this handler was truncated in the original; it is rebuilt
  // as the mirror of the "previous" handler, advancing only while another
  // page of filtered items exists.
  document.getElementById("dv-g-next")?.addEventListener("click", () => {
    if (currentPage * PAGE_SIZE < filteredItems.length) {
      currentPage++;
      render(filteredItems);
      dv.container.scrollTop = 0;
    }
  });

  // Initial view: every item, in load order.
  filteredItems = items;
  render(filteredItems);
}, 100);
/* === CSS RESPONSIVO INJETADO === */
// Stylesheet for the gallery: controls bar, responsive card grid, card parts
// and pager, plus two narrow-viewport breakpoints.
const css = `
.dv-gallery-controls {display:flex; gap:8px; flex-wrap:wrap; align-items:center; margin-bottom:1em;}
.dv-grid {display:grid; gap:12px; grid-template-columns: repeat(auto-fill, minmax(140px, 1fr));}
.dv-card {border-radius:10px; overflow:hidden; cursor:pointer; transition: transform 0.2s, box-shadow 0.2s; display:flex; flex-direction:column; background: var(--background-secondary);}
.dv-card:hover {transform: translateY(-4px); box-shadow:0 6px 16px rgba(0,0,0,0.2);}
.dv-card-media {position:relative; width:100%; height:0; padding-bottom:150%;}
.dv-card-media img {position:absolute; top:0; left:0; width:100%; height:100%; object-fit:cover; border-bottom:1px solid rgba(0,0,0,0.1);}
.dv-badges {position:absolute; top:6px; left:6px; display:flex; flex-direction:column; gap:4px;}
.dv-card-badge, .dv-card-subtype {padding:3px 7px; font-size:0.72rem; border-radius:4px; font-weight:600;}
.dv-card-body {padding:8px 6px; text-align:center; display:flex; flex-direction:column; gap:4px;}
.dv-card-title {font-weight:600; font-size:0.9rem; line-height:1.1rem; word-break:break-word;}
.dv-card-sub {font-size:0.75rem; opacity:0.7;}
.dv-card-rating {font-size:0.8rem;}
.dv-card-status {font-size:0.75rem; opacity:0.7;}
.dv-gallery-pages {display:flex; gap:8px; align-items:center; margin-top:1em;}
@media (max-width:768px) {.dv-grid {grid-template-columns: repeat(auto-fill, minmax(100px, 1fr));}}
@media (max-width:480px) {.dv-grid {grid-template-columns: repeat(auto-fill, minmax(80px, 1fr));} .dv-card-title {font-size:0.8rem;} .dv-card-sub {font-size:0.65rem;} .dv-card-rating {font-size:0.7rem;}}
`;

// Add the stylesheet to <head> only when no element with this id exists yet,
// so running the script again does not stack duplicate <style> tags.
if (document.getElementById("dv-gallery-style") === null) {
  const styleTag = Object.assign(document.createElement("style"), {
    id: "dv-gallery-style",
    innerHTML: css,
  });
  document.head.appendChild(styleTag);
}
Текущий код импорта Python:
# -*- coding: utf-8 -*-
import os
import requests
from bs4 import BeautifulSoup
import yaml
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
# Folder the generated Markdown notes are written to; created at import time
# if it does not exist yet.
OUTPUT_DIR = r"C:\Users\Usuario\Documents\Gnosis\3- Bem estar\Hobbies e Inspirações\Coleções\Leituras\Livros"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# JSON file, stored next to the notes, listing the "title_author" ids that
# main() has already processed.
CHECKPOINT_FILE = os.path.join(OUTPUT_DIR, "checkpoint.json")
# Number of worker threads main() uses to process books.
MAX_WORKERS = 5
# Request headers sent by scrape_list_page(); sets a browser-like User-Agent.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
def safe_filename(s):
    """Return ``s`` with characters that are invalid in file names replaced.

    Every character Windows forbids in a file name (``< > : " / \\ | ? *``)
    becomes ``-``; leading and trailing whitespace is stripped. The original
    only handled ``/``, ``\\`` and ``:``, so a title containing ``?`` or ``"``
    could not be written under the Windows OUTPUT_DIR.

    Args:
        s: Proposed file name (may already include the extension).

    Returns:
        The sanitized file name.
    """
    forbidden = '<>:"/\\|?*'
    table = str.maketrans({char: "-" for char in forbidden})
    return s.translate(table).strip()
def write_md(filename, yaml_obj, body_md=""):
    """Write a Markdown note with YAML front matter, never overwriting a file.

    The file is opened in exclusive-create mode ("x"), so the "already exists"
    check and the creation are a single atomic step. The previous
    ``os.path.exists`` check followed by ``open(..., "w")`` could let two
    worker threads handling books with the same title both pass the check and
    overwrite each other.

    Args:
        filename: File name relative to OUTPUT_DIR.
        yaml_obj: Mapping serialized as the front-matter block.
        body_md: Markdown text written after the front matter.

    Returns:
        None. An existing file is left untouched.
    """
    path = os.path.join(OUTPUT_DIR, filename)
    # Serialize first so a dump error cannot leave an empty file behind.
    front_matter = yaml.safe_dump(yaml_obj, sort_keys=False, allow_unicode=True)
    try:
        with open(path, "x", encoding="utf-8") as f:
            f.write("---\n")
            f.write(front_matter)
            f.write("---\n\n")
            f.write(body_md)
    except FileExistsError:
        return
def scrape_list_page(url, timeout=30, genre="Terror/Horror"):
    """Fetch one list page and extract the books found in its static HTML.

    Args:
        url: Page URL to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.
        genre: Genre label stored on every book of this page (previously
            hard-coded).

    Returns:
        A list of dicts with the keys ``title``, ``author``,
        ``average_rating``, ``image_url`` and ``genre``. Rows without a title
        or an author are skipped; a missing rating or cover becomes ``None``.

    Raises:
        requests.HTTPError: The response status is an error.
        requests.Timeout: The server did not answer within ``timeout``.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")

    books = []
    for book_row in soup.select("div.elementList"):
        title_tag = book_row.select_one("a.bookTitle span")
        author_tag = book_row.select_one("a.authorName span")
        if title_tag is None or author_tag is None:
            continue

        rating_tag = book_row.select_one("span.minirating")
        cover_tag = book_row.select_one("img.bookCover")

        avg_rating = None
        if rating_tag is not None:
            # Keep only what precedes " avg rating" in the rating text.
            avg_rating = rating_tag.get_text().split(" avg rating")[0].strip()

        books.append({
            "title": title_tag.get_text(strip=True),
            "author": author_tag.get_text(strip=True),
            "average_rating": avg_rating,
            "image_url": cover_tag.get("src") if cover_tag is not None else None,
            "genre": genre,
        })
    return books
def get_all_books_from_list(list_url):
    """Collect the books from every page of a list.

    Pages are requested as ``page=1, 2, ...`` until one returns no books. The
    ``page`` parameter is merged into any query string the URL already has
    (the original appended ``?page=N`` blindly, producing a second ``?``), and
    paging also stops when a page repeats the previous one, so a site that
    keeps serving its last page cannot cause an endless loop.

    Args:
        list_url: URL of the list, with or without a query string.

    Returns:
        The book dicts of all pages, in page order.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(list_url)
    base_query = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != "page"
    ]

    books = []
    previous_page = None
    page = 1
    while True:
        query = urlencode(base_query + [("page", str(page))])
        url = urlunsplit(parts._replace(query=query))
        print(f"[INFO] Scraping {url}")
        page_books = scrape_list_page(url)
        if not page_books or page_books == previous_page:
            break
        books.extend(page_books)
        previous_page = page_books
        page += 1
    return books
def process_book(book, processed_set):
    """Write the Markdown note for one scraped book unless already processed.

    Front-matter values are stored as plain scalars. The original wrapped
    each one as ``{value: None}``, which YAML serializes as a nested mapping
    (``title:`` followed by ``The Title: null``) instead of
    ``title: The Title``. The cover line in the body, lost in the original
    (an invalid f-string), is restored as a Markdown image.

    Args:
        book: Dict with ``title``, ``author``, ``average_rating`` and
            ``image_url`` as produced by scrape_list_page().
        processed_set: Ids (``"<title>_<author>"``) that must be skipped.

    Returns:
        The book id when a note was handed to write_md(), or ``None`` when
        the book was skipped.
    """
    title = book.get("title")
    autor = book.get("author")
    uid = f"{title}_{autor}"
    if uid in processed_set:
        print("[SKIP]", title)
        return None

    cover_url = book.get("image_url")
    online_rating = book.get("average_rating") or "Desconhecido"

    yaml_obj = {
        "title": title,
        "portugueseTitle": title,
        "englishTitle": title,
        "coverUrl": cover_url,
        "onlineRating": online_rating,
        "type": "Livros",
        "subType": "Terror",
        "status": "Desconhecido",
        "rating": "Desconhecido",
        "autor": autor,
    }

    fname = safe_filename(f"{title}.md")
    md_body = (
        f"# {title}\n\n"
        f"**Autor:** {autor}\n\n"
        f"**Gênero:** Terror\n\n"
        f"**Rating online:** {online_rating}\n\n"
    )
    if cover_url:
        md_body += f"![Capa]({cover_url})\n"

    write_md(fname, yaml_obj, md_body)
    print("[OK]", title)
    return uid
def main():
    """Scrape the configured list and write one note per new book.

    Ids of processed books are loaded from CHECKPOINT_FILE before the run and
    saved back afterwards. The checkpoint is written in a ``finally`` block
    and a failure in one book is reported without aborting the others, so an
    interrupted or partly failed run keeps the progress it made.
    """
    # NOTE(review): this URL arrived truncated ("245 ... er_Written") and will
    # not resolve as written - replace it with the full list URL.
    list_url = "https://www.goodreads.com/list/show/245 ... er_Written"
    books = get_all_books_from_list(list_url)
    print(f"[INFO] Found {len(books)} books.")

    processed = set()
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            processed = set(json.load(f))

    try:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            futures = {pool.submit(process_book, b, processed): b for b in books}
            for future in as_completed(futures):
                try:
                    uid = future.result()
                except Exception as exc:
                    # Report and keep going: one bad book must not discard
                    # the work already done for the rest of the list.
                    print("[ERROR]", futures[future].get("title"), exc)
                    continue
                if uid:
                    processed.add(uid)
    finally:
        with open(CHECKPOINT_FILE, "w", encoding="utf-8") as f:
            json.dump(sorted(processed), f, ensure_ascii=False, indent=2)

    print("[INFO] Import finished.")


if __name__ == "__main__":
    main()
Подробнее здесь: https://stackoverflow.com/questions/798 ... -efficient
Мобильная версия