import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Step 1: Extract module names and versions from the text file
def extract_modules_and_versions(text):
    """Parse a Gradle version-catalog-style text and pair modules with versions.

    Expects entries of the form:
        fooVersion = "1.2.3"
        some-lib = { module = "group:artifact", version.ref = "fooVersion" }

    Args:
        text: Raw contents of the catalog file.

    Returns:
        List of (module, version) tuples; version is None when the
        referenced *Version key is missing.
    """
    version_pattern = r'(\w+Version)\s*=\s*"([\d.]+)"'
    # NOTE: the dot in "version.ref" is escaped — an unescaped '.' would
    # match any character and could accept malformed keys.
    library_pattern = r'(\w+)\s*=\s*\{\s*module\s*=\s*"([\w.-]+:[\w.-]+)",\s*version\.ref\s*=\s*"(\w+Version)"\s*\}'

    # Map each "<name>Version" key to its literal version string.
    version_matches = re.findall(version_pattern, text)
    versions = {match[0]: match[1] for match in version_matches}

    # Resolve each library's version.ref against the versions map.
    library_matches = re.findall(library_pattern, text)
    modules = []
    for match in library_matches:
        module_name = match[1]
        version_ref = match[2]
        module_version = versions.get(version_ref)
        modules.append((module_name, module_version))
    return modules
# Step 2: Scrape mvnrepository.com for release date, homepage, and vulnerabilities
def get_mvnrepository_info(module, version):
    """Scrape mvnrepository.com for metadata about one artifact version.

    Args:
        module: Maven coordinate as "groupId:artifactId".
        version: Exact version string to look up.

    Returns:
        5-tuple (release_date, homepage, vulnerabilities, categories,
        description). Placeholder strings are returned for any field that
        cannot be scraped, and for non-200 responses.

    Bug fixed: the original returned None implicitly when the page was 200
    but the expected table was missing, which crashed the caller's tuple
    unpacking. Defaults are now always returned.
    """
    group_id, artifact_id = module.split(':')
    base_url = f"https://mvnrepository.com/artifact/{group_id}/{artifact_id}/{version}"
    print(f"base_url: {base_url}")  # Print base_url object for debugging

    # Headers to mimic a real browser request to avoid 403 errors
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Placeholders returned whenever a field (or the whole page) is missing.
    release_date = "Unknown"
    homepage = "No homepage available"
    vulnerabilities = "No known vulnerabilities"
    categories = "No Categories available"
    description = "No description available"

    response = requests.get(base_url, headers=headers)
    print(f"Response: {response}")  # Print response object for debugging

    if response.status_code == 403:
        print(f"403 Forbidden - base_url: {base_url}")
        return release_date, homepage, vulnerabilities, categories, description
    if response.status_code != 200:
        return release_date, homepage, vulnerabilities, categories, description

    # NOTE(review): the reported symptom of always getting the *latest*
    # release date suggests the site may serve a fallback/redirected page
    # when the exact version URL does not exist. Consider comparing
    # response.url against base_url to detect that — TODO confirm.
    soup = BeautifulSoup(response.text, 'html.parser')

    table_tag = soup.find("table", class_="grid")
    if table_tag:
        print(f"table_tag: {table_tag}")

    # Extract description
    description_tag = soup.find("div", class_="im-description")
    if description_tag:
        description = description_tag.get_text(strip=True)

    # Extract Categories (first link in the cell next to the "Categories" header)
    categories_tag = soup.find("th", string="Categories")
    if categories_tag:
        categories_link = categories_tag.find_next_sibling("td").find("a")
        if categories_link:
            categories = categories_link.get_text(strip=True)
        print(f"categories: {categories}")

    # Extract homepage URL based on HomePage
    homepage_tag = soup.find("th", string="HomePage")
    if homepage_tag:
        homepage_link = homepage_tag.find_next_sibling("td").find("a")
        if homepage_link:
            homepage = homepage_link['href']

    # Extract release date based on Date
    release_date_tag = soup.find("th", string="Date")
    if release_date_tag:
        release_date = release_date_tag.find_next_sibling("td").get_text(strip=True)
        print(f"release_date: {release_date}")

    # Extract vulnerabilities (Check for any known vulnerabilities)
    vulnerabilities_tag = soup.find("th", string="Vulnerabilities")
    if vulnerabilities_tag:
        print(f"vulnerabilities found: {release_date}")
        vulnerabilities_td = vulnerabilities_tag.find_next_sibling("td")
        if vulnerabilities_td:
            print(f"vulnerabilities exist: {vulnerabilities_td}")
            # Find all tags with class "vuln"
            vulnerability_list = vulnerabilities_td.find_all("a", class_="vuln")
            if vulnerability_list:
                print(f"vulnerability_list: {vulnerability_list}")
                vulnerabilities = ', '.join([v.get_text(strip=True) for v in vulnerability_list])

    return release_date, homepage, vulnerabilities, categories, description
# Step 3: Generate XLSX file with extracted data
def generate_xlsx(modules_info, output_file="modules_info.xlsx"):
    """Export the collected module rows to an Excel spreadsheet.

    Args:
        modules_info: Iterable of rows, each [module, version, release date,
            description, vulnerabilities].
        output_file: Destination .xlsx path (default "modules_info.xlsx").
    """
    column_names = ['Module', 'Version', 'Release Date', 'Description', 'Vulnerabilities']
    frame = pd.DataFrame(modules_info, columns=column_names)
    frame.to_excel(output_file, index=False)
    print(f"Data has been written to {output_file}")
# Main function to execute the workflow
def main():
    """Run the pipeline: parse ML.txt, scrape mvnrepository, write XLSX."""
    # Read the raw version-catalog text.
    with open('ML.txt', 'r') as file:
        text = file.read()

    # Turn the text into (module, version) pairs.
    modules = extract_modules_and_versions(text)

    # Collect one output row per module, skipping unwanted categories.
    rows = []
    for module, version in modules:
        release_date, homepage, vulnerabilities, categories, description = get_mvnrepository_info(module, version)
        if categories in ["Testing Frameworks & Tools", "Logging Frameworks"]:
            print(f"Skipping module {module} due to category: {categories}")
            continue
        rows.append([module, version, release_date, description, vulnerabilities])

    # Persist everything to the spreadsheet.
    generate_xlsx(rows)


# Execute the main function
if __name__ == "__main__":
    main()
Я пытаюсь извлечь дату выпуска конкретной версии репозитория Maven. Однако дата выпуска всегда соответствует последней версии. Вот мой код. [code]import re import requests import pandas as pd from bs4 import BeautifulSoup from openpyxl import Workbook
# Step 1: Extract module names and versions from the text file def extract_modules_and_versions(text): version_pattern = r'(\w+Version)\s*=\s*"([\d.]+)"' library_pattern = r'(\w+)\s*=\s*\{\s*module\s*=\s*"([\w.-]+:[\w.-]+)",\s*version.ref\s*=\s*"(\w+Version)"\s*\}'
version_matches = re.findall(version_pattern, text) versions = {match[0]: match[1] for match in version_matches}
library_matches = re.findall(library_pattern, text) modules = [] for match in library_matches: module_name = match[1] version_ref = match[2] module_version = versions.get(version_ref) modules.append((module_name, module_version))
return modules
# Step 2: Scrape mvnrepository.com for release date, homepage, and vulnerabilities def get_mvnrepository_info(module, version): group_id, artifact_id = module.split(':') base_url = f"https://mvnrepository.com/artifact/{group_id}/{artifact_id}/{version}" print(f"base_url: {base_url}") # Print base_url object for debugging
# Headers to mimic a real browser request to avoid 403 errors headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" }
# Request the URL response = requests.get(base_url, headers=headers) print(f"Response: {response}") # Print response object for debugging
# Check for 403 status code and print the base URL if encountered if response.status_code == 403: print(f"403 Forbidden - base_url: {base_url}") return "Unknown", "No homepage available", "No known vulnerabilities", "No Categories available", "No description available"
if response.status_code == 200: soup = BeautifulSoup(response.text, 'html.parser')
# Extract description description = "No description available" description_tag = soup.find("div", class_="im-description") if description_tag: description = description_tag.get_text(strip = True)
# Extract Categories categories = "No Categories available" categories_tag = soup.find("th", string="Categories") if categories_tag: categories = categories_tag.find_next_sibling("td").find("a").get_text(strip = True) if categories_tag.find_next_sibling("td").find("a") else categories print(f"categories: {categories}")
# Extract homepage URL based on HomePage homepage = "No homepage available" homepage_tag = soup.find("th", string="HomePage") if homepage_tag: homepage = homepage_tag.find_next_sibling("td").find("a")['href'] if homepage_tag.find_next_sibling("td").find("a") else homepage
# Extract release date based on Date release_date = "Unknown" release_date_tag = soup.find("th", string="Date") if release_date_tag: release_date = release_date_tag.find_next_sibling("td").get_text(strip = True) print(f"release_date: {release_date}")
# Extract vulnerabilities (Check for any known vulnerabilities) vulnerabilities = "No known vulnerabilities" vulnerabilities_tag = soup.find("th", string="Vulnerabilities") if vulnerabilities_tag: print(f"vulnerabilities found: {release_date}") # Look for the following the "Vulnerabilities" vulnerabilities_td = vulnerabilities_tag.find_next_sibling("td") if vulnerabilities_td: print(f"vulnerabilities exist: {vulnerabilities_td}") # Find all tags with class "vuln" vulnerability_list = vulnerabilities_td.find_all("a", class_="vuln") if vulnerability_list: print(f"vulnerability_list: {vulnerability_list}") vulnerabilities = ', '.join([v.get_text(strip = True) for v in vulnerability_list])
return release_date, homepage, vulnerabilities, categories, description else: return "Unknown", "No homepage available", "No known vulnerabilities", "No Categories available", "No description available"
# Step 3: Generate XLSX file with extracted data def generate_xlsx(modules_info, output_file="modules_info.xlsx"): # Create a pandas DataFrame df = pd.DataFrame(modules_info, columns=['Module', 'Version', 'Release Date', 'Description', 'Vulnerabilities'])
# Write the DataFrame to an Excel file df.to_excel(output_file, index=False) print(f"Data has been written to {output_file}")
# Main function to execute the workflow def main(): # Step 1: Read text from the file ML.txt with open('ML.txt', 'r') as file: text = file.read()
# Step 2: Extract modules and versions modules = extract_modules_and_versions(text)
# Step 3: Fetch information from mvnrepository.com and create list of module information modules_info = [] for module, version in modules: release_date, homepage, vulnerabilities, categories, description = get_mvnrepository_info(module, version)
# Skip recording if categories match specified ones if categories in ["Testing Frameworks & Tools", "Logging Frameworks"]: print(f"Skipping module {module} due to category: {categories}") continue
# Execute the main function if __name__ == "__main__": main() [/code] Мой текстовый файл выглядит следующим образом: [code]robolectricVersion = "4.8" test-robolectric = { module = "org.robolectric:robolectric", version.ref = "robolectricVersion"} [/code] В конечном итоге я получаю следующее:
release_date: 17 октября 2024 г.
Вместо 3 мая 2022 г. Даже после распечатки таблицы указано 17 октября 2024 г. Есть идеи, почему это происходит? Заранее спасибо!!
Я подумываю о создании веб-сайта, подобного этому. Моя цель — настроить систему, которая автоматически извлекает изображения и тексты из пакета папок. Каждая папка будет представлять одну запись в HTML-файле «портфолио». Каждый файл в каждой папке...
Я новичок в Python, создал этот скрипт Python через Chat GPT и просматриваю несколько вопросов на этом сайте. Он работает с файлами, которые у меня есть, но он очень жестко запрограммирован для этих конкретных текстовых файлов. Как я могу сделать...
Я довольно новичок в Java, и я столкнулся с ошибкой, которую я просто не могу выяснить! (или любая команда ./mvnw на самом деле!) Я сталкиваюсь с следующей ошибкой:
Exception in thread main java.lang.IllegalStateException:...
Я довольно новичок в Java, и я столкнулся с ошибкой, которую я просто не могу выяснить! (или любая команда ./mvnw на самом деле!) Я сталкиваюсь с следующей ошибкой:
Exception in thread main java.lang.IllegalStateException:...
Я довольно новичок в Java, и я столкнулся с ошибкой, которую я просто не могу выяснить! (или любая команда ./mvnw на самом деле!) Я сталкиваюсь с следующей ошибкой:
Exception in thread main java.lang.IllegalStateException:...