#!/usr/bin/env python3
"""
Official Bulletin -> CSV for WordPress import.

Extracts the notices from a .doc or .docx file and writes a CSV ready to be
imported with the "Really Simple CSV Importer" plugin.

Usage:
    python boletin_to_csv.py BO29.doc
    python boletin_to_csv.py BO29.docx
"""

import sys
import os
import csv
import html
import re
import shutil
import tempfile
import subprocess
from pathlib import Path
from collections import Counter

# Categories whose posts are tagged with the bulletin year.
CATS_CON_ETIQUETA = {
    "Leyes",
    "Decretos",
    "Resoluciones",
    "Decretos Municipales",
    "Ordenanzas",
    "Resoluciones Municipales",
    "Partidos Políticos",
}

PDF_IMG_URL = "https://boletinoficial.jujuy.gob.ar/wp-content/uploads/2016/10/PDFDownload.png"
PDF_BASE_URL = "https://boletinoficial.jujuy.gob.ar/wp-content/uploads/2016/Boletines/{anio}/{numero}-{anio}.pdf"

# Maps accented upper-case vowels to their plain form so keyword matching
# gives the same answer for "LICITACIÓN" and "LICITACION".
_SIN_ACENTOS = str.maketrans("ÁÉÍÓÚÜ", "AEIOUU")


def armar_bloque_pdf(numero, anio):
    """Build the HTML snippet with the PDF icon linking to the bulletin.

    Args:
        numero: Bulletin number, substituted into PDF_BASE_URL.
        anio: Bulletin year, substituted into PDF_BASE_URL.

    Returns:
        A paragraph containing the PDF icon wrapped in a link to the PDF.
    """
    url = PDF_BASE_URL.format(anio=anio, numero=numero)
    # NOTE(review): minimal markup; confirm attributes/classes against the
    # posts already published on the site before importing.
    return (
        f'<p>'
        f'<a href="{url}">'
        f'<img src="{PDF_IMG_URL}" alt="PDF" />'
        f'</a>'
        f'</p>'
    )


def encontrar_libreoffice():
    """Return the path of the LibreOffice launcher, or None if not found.

    The default Windows install locations are checked first, then the PATH.
    """
    rutas_windows = [
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
    ]
    for ruta in rutas_windows:
        if os.path.exists(ruta):
            return ruta
    # Some installations only expose the "libreoffice" launcher name.
    return shutil.which("soffice") or shutil.which("libreoffice")


def convertir_doc_a_docx(doc_path):
    """Convert a .doc file to .docx with headless LibreOffice.

    Args:
        doc_path: Path of the .doc file to convert.

    Returns:
        Path of the converted .docx, placed in the system temp directory.

    Exits the process with status 1 when LibreOffice is missing or the
    conversion does not produce the expected file.
    """
    soffice = encontrar_libreoffice()
    if not soffice:
        print("ERROR: LibreOffice no encontrado.")
        print("Instalalo desde https://www.libreoffice.org y reiniciá la computadora.")
        print("O convertí el archivo a .docx desde Word manualmente.")
        sys.exit(1)

    tmpdir = tempfile.mkdtemp()
    try:
        result = subprocess.run(
            [soffice, "--headless", "--convert-to", "docx", "--outdir", tmpdir, doc_path],
            capture_output=True,
            text=True,
        )
        out_name = Path(doc_path).stem + ".docx"
        convertido = os.path.join(tmpdir, out_name)
        # A zero exit status alone is not trusted: also require the output
        # file, otherwise the copy below would fail with a raw traceback.
        if result.returncode != 0 or not os.path.exists(convertido):
            print(f"ERROR al convertir: {result.stderr or result.stdout}")
            sys.exit(1)
        final = os.path.join(tempfile.gettempdir(), out_name)
        shutil.copy(convertido, final)
        return final
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)


def clasificar_aviso(first, lines):
    """Classify a notice from its first line (and, for a few rules, its body).

    Matching is case-insensitive and accent-insensitive. Rules are evaluated
    in order and the first match wins.

    Args:
        first: First line of the notice.
        lines: All lines of the notice; joined to form the body text.

    Returns:
        The category name, or "Sin clasificar" when no rule matches.
    """
    first = first.upper().translate(_SIN_ACENTOS)
    body = " ".join(lines).upper().translate(_SIN_ACENTOS)

    if first.startswith("LEY"):
        return "Leyes"
    if first.startswith("DECRETO"):
        return "Decretos Municipales" if "MUNICIPAL" in first else "Decretos"
    if first.startswith("ORDENANZA"):
        return "Ordenanzas"
    if first.startswith("RESOLUCION"):
        return "Resoluciones Municipales" if "MUNICIPAL" in first else "Resoluciones"
    if "INSTITUTO DE VIVIENDA" in first:
        return "Resoluciones"
    if any(clave in first for clave in ("MUNICIPALIDAD", "MUNICIPIO", "COMISION MUNICIPAL")):
        return "Decretos Municipales"
    if "PARTIDO" in first and "POLITIC" in first:
        return "Partidos Políticos"
    # Must run before the "CITACI" rule: "LICITACION" contains "CITACI".
    if "LICITACION" in first or "CONCURSO DE PRECIO" in first:
        return "Licitaciones"
    if "REMATE" in first:
        return "Remates"
    if "QUIEBRA" in first or ("CONCURSO" in first and "PRECIO" not in first):
        return "Concursos y Quiebras"
    if "MINAS" in first or "JUEZ ADMINISTRATIVO DE MINAS" in body:
        return "Minas"
    if "USUCAP" in first:
        return "Usucapión"
    if "NOTIFICACI" in first:
        return "Notificaciones"
    if "CITACI" in first:
        return "Citación"
    if "SUCESORIO" in first or "TRIBUNAL DE FAMILIA" in body or "JUZGADO" in first:
        return "Sucesorios"
    if first.startswith(("ACTA", "CONTRATO", "CESION", "ESCRITURA")) or any(
        clave in first
        for clave in ("INSTRUMENTO CONSTITUTIVO", "DECLARACION JURADA", "ANEXO")
    ):
        return "Contratos"
    return "Sin clasificar"


def _agrupar_bloques(textos, sep_indices):
    """Group stripped paragraph texts into notices.

    Each index in sep_indices is the last paragraph of a notice. Empty texts
    are dropped, and text after the last separator forms a final notice.
    """
    bloques = []
    inicio = 0
    for fin in sep_indices:
        bloque = [texto for texto in textos[inicio:fin + 1] if texto]
        if bloque:
            bloques.append(bloque)
        inicio = fin + 1
    resto = [texto for texto in textos[inicio:] if texto]
    if resto:
        bloques.append(resto)
    return bloques


def _armar_aviso(lines, titulo_boletin, anio, bloque_pdf):
    """Build the CSV row (as a dict) for one notice."""
    category = clasificar_aviso(lines[0], lines)
    etiqueta = anio if category in CATS_CON_ETIQUETA else ""
    # Content: the PDF icon first, then the text. The text comes from the
    # document as plain text, so it is escaped before being embedded in HTML.
    # NOTE(review): one <p> per line is a minimal wrapper; confirm it matches
    # the markup used by the posts already on the site.
    html_lines = "\n".join(
        f"<p>{html.escape(line, quote=False)}</p>" for line in lines
    )
    return {
        "post_title": titulo_boletin,
        "post_content": bloque_pdf + "\n" + html_lines,
        "post_status": "publish",
        "post_type": "post",
        "tax_category": category,
        "tax_post_tag": etiqueta,
    }


def extraer_avisos(docx_path, titulo_boletin, anio, numero, bloque_pdf):
    """Extract the notices of a .docx bulletin as a list of CSV row dicts.

    A non-empty paragraph whose XML contains "pBdr" is treated as the last
    paragraph of a notice.

    Args:
        docx_path: Path of the .docx file.
        titulo_boletin: Used as the post title of every notice.
        anio: Year, used as the tag for categories in CATS_CON_ETIQUETA.
        numero: Bulletin number (accepted for interface compatibility; unused).
        bloque_pdf: HTML snippet prepended to every notice's content.

    Returns:
        One dict per notice, keyed by the CSV column names.
    """
    from docx import Document

    doc = Document(docx_path)
    # Document.paragraphs builds a new list on each access, so read it once
    # instead of indexing it inside loops.
    paragraphs = doc.paragraphs
    textos = [para.text.strip() for para in paragraphs]

    sep_indices = [
        i
        for i, para in enumerate(paragraphs)
        if textos[i] and "pBdr" in para._element.xml
    ]

    return [
        _armar_aviso(lines, titulo_boletin, anio, bloque_pdf)
        for lines in _agrupar_bloques(textos, sep_indices)
    ]


def main():
    """Command-line entry point: convert the given bulletin into a CSV file."""
    try:
        import docx  # noqa: F401  (availability probe only)
    except ImportError:
        import importlib

        print("Instalando python-docx...")
        instalacion = subprocess.run(
            [sys.executable, "-m", "pip", "install", "python-docx", "-q"]
        )
        if instalacion.returncode != 0:
            print("ERROR: no se pudo instalar python-docx.")
            sys.exit(1)
        # Let the running interpreter see the package that was just installed.
        importlib.invalidate_caches()

    if len(sys.argv) < 2:
        print("Uso: python boletin_to_csv.py archivo.doc")
        print(" python boletin_to_csv.py archivo.docx")
        sys.exit(1)

    archivo = sys.argv[1]

    if archivo.lower().endswith(".doc"):
        print(f"Convirtiendo {archivo} a .docx con LibreOffice...")
        archivo = convertir_doc_a_docx(archivo)
        print(" → Listo\n")
    elif not archivo.lower().endswith(".docx"):
        print("ERROR: El archivo debe ser .doc o .docx")
        sys.exit(1)

    print("─" * 55)
    titulo = input("Título del boletín (ej: BOLETIN OFICIAL Nº 29 – 09/03/2026): ").strip()
    if not titulo:
        print("ERROR: El título no puede estar vacío.")
        sys.exit(1)

    # Detect the bulletin number and year from the title.
    match_num = re.search(r'N[ºo°]?\s*(\d+)', titulo, re.IGNORECASE)
    match_anio = re.search(r'\b(20\d{2})\b', titulo)
    numero = match_num.group(1) if match_num else ""
    anio = match_anio.group(1) if match_anio else ""

    if anio:
        print(f" → Año detectado: {anio}")
    else:
        print(" ⚠ No se pudo detectar el año — el ícono PDF no se agregará")
    if numero:
        print(f" → Número de boletín: {numero}")
    else:
        print(" ⚠ No se pudo detectar el número de boletín — el ícono PDF no se agregará")
    # The link is only meaningful (and the icon only added) with both values.
    if numero and anio:
        pdf_url = PDF_BASE_URL.format(anio=anio, numero=numero)
        print(f" → Link PDF: {pdf_url}")
    print("─" * 55)
    print()

    bloque_pdf = armar_bloque_pdf(numero, anio) if numero and anio else ""
    salida = Path(sys.argv[1]).stem + "_avisos.csv"

    print("Extrayendo avisos...")
    avisos = extraer_avisos(archivo, titulo, anio, numero, bloque_pdf)
    print(f" → {len(avisos)} avisos encontrados")

    resumen = Counter(a["tax_category"] for a in avisos)
    print("\nResumen por categoría:")
    for cat, n in sorted(resumen.items()):
        print(f" {n:3d} {cat}")

    campos = ["post_title", "post_content", "post_status", "post_type", "tax_category", "tax_post_tag"]
    # utf-8-sig writes a BOM at the start of the file.
    with open(salida, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=campos)
        writer.writeheader()
        writer.writerows(avisos)

    print(f"\n✓ CSV generado: {salida}")
    print(f" {len(avisos)} entradas listas para importar en WordPress")
    print()
    print("Próximo paso: Herramientas → Import CSV en WordPress")


if __name__ == "__main__":
    main()