Код: Выделить всё
import re
import os
import zipfile
from lxml import etree
for file in os.listdir(filepath):
if file.endswith('.zip') or file.endswith('.docx'):
ext = os.path.splitext(file)[1]
newfile = f"{os.path.splitext(os.path.basename(file))[0]}_new{ext}"
zippedin = zipfile.ZipFile(os.path.join(filepath, file), 'r')
recovering_parser = etree.XMLParser(recover=True)
matched_items = []
for item in zippedin.infolist():
xmltree = etree.fromstring(zippedin.read(item.filename), parser=recovering_parser)
for node in xmltree.iter(tag=etree.Element):
if re.search('XXXXXXX', str(node)) or re.search('YYYYYYYY', str(node.attrib)):
matched_items.append(item)
with zipfile.ZipFile(os.path.join(filepath, newfile), 'w') as zippedout:
for element in matched_items:
zippedout.writestr(element, zippedin.read(element.filename))
zippedin.close()
Код: Выделить всё
lxml.etree.XMLSyntaxError: xmlns: 'ABCDEFGHXXXX' is not a valid URI, line 5, column 45
Подробнее здесь: https://stackoverflow.com/questions/793 ... ed-archive
Мобильная версия