#!/usr/bin/env python3
"""
Boletín Oficial -> CSV for WordPress import.

Extracts notices ("avisos") from a .doc or .docx file and writes a CSV that is
ready to be imported with the "Really Simple CSV Importer" plugin.

Usage:
    python boletin_to_csv.py BO29.doc
    python boletin_to_csv.py BO29.docx

NOTE(review): this file was recovered from a copy whose HTML markup inside
string literals had been stripped and whose whitespace had been collapsed.
The tag names below (<p>, <a>, <img>, <ul>/<ol>, <li>, <table>, <tr>, <td>)
follow from the surrounding logic, but their exact attributes and the
indentation inside console messages could not be recovered; confirm them
against a known-good copy before relying on the output.
"""

import sys
import os
import csv
import html
import re
import shutil
import tempfile
import subprocess
import unicodedata
import zipfile
from pathlib import Path
from collections import Counter

# Categories whose posts are tagged with the bulletin year.
CATS_CON_ETIQUETA = {
    "Leyes",
    "Acuerdo Legislativo",
    "Decretos",
    "Resoluciones",
    "Decretos Municipales",
    "Ordenanzas",
    "Resoluciones Municipales",
    "Partidos Políticos",
}

PDF_IMG_URL = "https://boletinoficial.jujuy.gob.ar/wp-content/uploads/2016/10/PDFDownload.png"
PDF_BASE_URL = "https://boletinoficial.jujuy.gob.ar/wp-content/uploads/2016/Boletines/{anio}/{numero}-{anio}.pdf"

# XML namespaces used inside word/document.xml and word/numbering.xml.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
WPS_NS = "http://schemas.microsoft.com/office/word/2010/wordprocessingShape"

# Section headers that map directly to one WordPress category.
# Keys are stored in the normalized form produced by _normalizar_encabezado
# (upper case, no accents, single spaces, " - " around dashes).
_MAPA_CATEGORIAS = {
    "PARTIDOS POLITICOS": "Partidos Políticos",
    "LICITACIONES - CONCURSO DE PRECIOS": "Licitaciones",
    "CONTRATOS - CONVOCATORIAS - ACTAS": "Contratos",
    "REMATES": "Remates",
    "CONCURSOS Y QUIEBRAS": "Concursos y Quiebras",
    "EDICTOS DE MINAS": "Minas",
    "EDICTOS DE USUCAPION": "Usucapión",
    "EDICTOS DE NOTIFICACION": "Notificaciones",
    "EDICTOS DE CITACION": "Citación",
    "EDICTOS SUCESORIOS": "Sucesorios",
}

_NUM_ID_RE = re.compile(r'<w:numId w:val="(\d+)"')
_NUMERO_RE = re.compile(r'N[ºo°]?\s*(\d+)', re.IGNORECASE)
_ANIO_RE = re.compile(r'\b(20\d{2})\b')
_GUIONES_RE = re.compile(r"\s*[-–—]\s*")


def _normalizar(text):
    """Return *text* upper-cased, without accents and with single spaces.

    Accents are removed by NFD-decomposing and dropping combining marks, so
    "Resolución" and "RESOLUCION" compare equal (note that "Ñ" becomes "N").
    """
    descompuesto = unicodedata.normalize("NFD", text.upper())
    sin_acentos = "".join(ch for ch in descompuesto if not unicodedata.combining(ch))
    return " ".join(sin_acentos.split())


def _normalizar_encabezado(text):
    """Normalize a section header: like _normalizar, plus uniform ' - ' dashes."""
    return _normalizar(_GUIONES_RE.sub(" - ", text))


def armar_bloque_pdf(numero, anio):
    """Build the HTML snippet with the PDF icon linking to the bulletin PDF.

    The link target is PDF_BASE_URL filled in with *numero* and *anio*.
    """
    url = html.escape(PDF_BASE_URL.format(anio=anio, numero=numero), quote=True)
    # NOTE(review): the original attributes of these tags were lost; this is a
    # minimal reconstruction (block wrapper + link + image).
    return (
        f'<p>'
        f'<a href="{url}" target="_blank">'
        f'<img src="{PDF_IMG_URL}" alt="PDF" />'
        f'</a>'
        f'</p>'
    )


def encontrar_libreoffice():
    """Return the path to the LibreOffice `soffice` binary, or None.

    The standard Windows install locations are tried first, then the PATH.
    """
    rutas_windows = [
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
    ]
    for ruta in rutas_windows:
        if os.path.exists(ruta):
            return ruta
    return shutil.which("soffice")


def convertir_doc_a_docx(doc_path):
    """Convert a .doc file to .docx with headless LibreOffice.

    Returns the path of the converted file, which is placed in the system
    temporary directory (the caller owns it and may delete it). Exits the
    process with status 1 when LibreOffice is missing or the conversion fails.
    """
    soffice = encontrar_libreoffice()
    if not soffice:
        print("ERROR: LibreOffice no encontrado.")
        print("Instalalo desde https://www.libreoffice.org y reiniciá la computadora.")
        print("O convertí el archivo a .docx desde Word manualmente.")
        sys.exit(1)

    tmpdir = tempfile.mkdtemp()
    try:
        result = subprocess.run(
            [soffice, "--headless", "--convert-to", "docx", "--outdir", tmpdir, doc_path],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            print(f"ERROR al convertir: {result.stderr}")
            sys.exit(1)

        out_name = Path(doc_path).stem + ".docx"
        convertido = os.path.join(tmpdir, out_name)
        # soffice can exit with 0 without producing any output (e.g. for an
        # unreadable input), so check before copying.
        if not os.path.exists(convertido):
            print(f"ERROR al convertir: LibreOffice no generó {out_name}")
            sys.exit(1)

        final = os.path.join(tempfile.gettempdir(), out_name)
        shutil.copy(convertido, final)
        return final
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)


def es_encabezado_categoria(text):
    """Return the category marker if *text* is a bulletin section header.

    The comparison ignores case, accents, repeated whitespace and the kind of
    dash used. Returns:
      * "_IGNORAR_" for boxes whose section must be skipped,
      * "_LEYES_DECRETOS_RESOLUCIONES_" / "_MUNICIPIOS_" for sections that
        are sub-classified notice by notice,
      * the WordPress category name for sections with a direct mapping,
      * None when the text is not a known header.
    """
    t = _normalizar_encabezado(text)

    # Boxes to ignore.
    if t == "FE DE ERRATAS":
        return "_IGNORAR_"

    # Sections that are sub-classified notice by notice.
    if t == "LEYES - DECRETOS - RESOLUCIONES":
        return "_LEYES_DECRETOS_RESOLUCIONES_"
    if t == "MUNICIPIOS - COMISIONES MUNICIPALES":
        return "_MUNICIPIOS_"

    # Sections with a direct WordPress category.
    return _MAPA_CATEGORIAS.get(t)


def _tag_local(elemento):
    """Return the tag of an lxml node without its namespace ('' for comments/PIs)."""
    tag = elemento.tag
    return tag.split("}")[-1] if isinstance(tag, str) else ""


def _leer_body(docx_path):
    """Return the direct children of <w:body> in word/document.xml as a list."""
    from lxml import etree

    with zipfile.ZipFile(docx_path) as z:
        xml_raw = z.read("word/document.xml")
    body = etree.fromstring(xml_raw).find(f"{{{W_NS}}}body")
    return [] if body is None else list(body)


def _texto_txbx(txbx, separador):
    """Join the text of every <w:t> inside a text box with *separador*."""
    textos = txbx.findall(f".//{{{W_NS}}}t")
    return separador.join(t.text for t in textos if t.text).strip()


def _categoria_en_txbx(child):
    """Return the category announced by a text box anchored in *child*, if any.

    When several boxes in the same node are headers, the last one wins.
    """
    categoria = None
    for txbx in child.findall(f".//{{{WPS_NS}}}txbx"):
        # Runs are joined with a space first; the tight join is a fallback for
        # headers where Word split a single word across several runs.
        cat = (es_encabezado_categoria(_texto_txbx(txbx, " "))
               or es_encabezado_categoria(_texto_txbx(txbx, "")))
        if cat:
            categoria = cat
    return categoria


def _leer_formatos_lista(docx_path):
    """Map each numId in word/numbering.xml to 'ol' (decimal) or 'ul' (other).

    Only level 0 of the abstract numbering is considered. This is best
    effort: on any read/parse problem a warning is printed and the (possibly
    empty) map is returned, so lists fall back to 'ul'.
    """
    formatos = {}
    try:
        from lxml import etree

        with zipfile.ZipFile(docx_path) as z:
            if "word/numbering.xml" not in z.namelist():
                return formatos
            raiz = etree.fromstring(z.read("word/numbering.xml"))

        w = f"{{{W_NS}}}"
        abstractos = {}
        for an in raiz.findall(f"{w}abstractNum"):
            niveles = {}
            for lvl in an.findall(f"{w}lvl"):
                fmt = lvl.find(f"{w}numFmt")
                if fmt is not None:
                    niveles[lvl.get(f"{w}ilvl")] = fmt.get(f"{w}val")
            abstractos[an.get(f"{w}abstractNumId")] = niveles

        for num in raiz.findall(f"{w}num"):
            ref = num.find(f"{w}abstractNumId")
            if ref is not None:
                niveles = abstractos.get(ref.get(f"{w}val"), {})
                fmt = niveles.get("0", "bullet")
                formatos[num.get(f"{w}numId")] = "ol" if fmt == "decimal" else "ul"
    except Exception as exc:  # best effort: list detection is optional
        print(f"  ⚠ No se pudo leer la numeración de listas ({exc}); "
              "se usarán viñetas", file=sys.stderr)
    return formatos


def _para_to_html(para, num_format_map):
    """Return (escaped_text, list_type) for a python-docx paragraph.

    list_type is 'ul' or 'ol' when the paragraph has numbering properties,
    otherwise None. Empty paragraphs yield (None, None).

    NOTE(review): the tail of the original function was lost; this
    reconstruction only looks up the paragraph's numId in *num_format_map*.
    """
    text = para.text.strip()
    if not text:
        return None, None

    # Paragraph text is plain text, so it must be escaped before being
    # embedded in HTML.
    seguro = html.escape(text, quote=False)
    xml = para._element.xml
    if "numPr" in xml:
        coincidencia = _NUM_ID_RE.search(xml)
        num_id = coincidencia.group(1) if coincidencia else None
        return seguro, num_format_map.get(num_id, "ul")
    return seguro, None


def _tabla_to_html(table):
    """Render a python-docx table as a plain HTML table.

    NOTE(review): the attributes of the original <table> tag were lost.
    """
    rows_html = []
    for row in table.rows:
        cells = [
            f"<td>{html.escape(cell.text.strip(), quote=False)}</td>"
            for cell in row.cells
        ]
        rows_html.append("<tr>" + "".join(cells) + "</tr>")
    return '<table border="1">' + "".join(rows_html) + "</table>"


def _build_block_html(para_indices, paragraphs, tables, tablas_tras_para, num_format_map):
    """Build the HTML of one notice, including lists and tables.

    *para_indices* is the contiguous run of paragraph indices of the block.
    *tablas_tras_para* maps a paragraph index to the indices of the tables
    that follow it directly in the document body; a table belongs to the
    block that contains the paragraph preceding it.
    """
    partes = []
    lista_abierta = None  # None, 'ul' or 'ol'

    for pi in para_indices:
        text, list_type = _para_to_html(paragraphs[pi], num_format_map)
        if text:
            if list_type:
                # When the list type changes, close the old list and open the new.
                if lista_abierta != list_type:
                    if lista_abierta:
                        partes.append(f"</{lista_abierta}>")
                    partes.append(f"<{list_type}>")
                    lista_abierta = list_type
                partes.append(f"<li>{text}</li>")
            else:
                if lista_abierta:
                    partes.append(f"</{lista_abierta}>")
                    lista_abierta = None
                partes.append(f"<p>{text}</p>")

        for ti in tablas_tras_para.get(pi, ()):
            if ti >= len(tables):
                continue
            if lista_abierta:
                partes.append(f"</{lista_abierta}>")
                lista_abierta = None
            partes.append(_tabla_to_html(tables[ti]))

    if lista_abierta:
        partes.append(f"</{lista_abierta}>")
    return "\n".join(partes)


def _subclasificar_leyes_decretos(lines):
    """Pick the category of a notice inside 'LEYES - DECRETOS - RESOLUCIONES'.

    The decision is based on the first non-empty line of the notice.
    """
    first = _normalizar(lines[0])
    if first.startswith("LEY"):
        return "Leyes"
    if first.startswith("ACUERDO"):
        return "Acuerdo Legislativo"
    if first.startswith("DECRETO"):
        return "Decretos Municipales" if "MUNICIPAL" in first else "Decretos"
    if first.startswith("RESOLUCION"):
        return "Resoluciones Municipales" if "MUNICIPAL" in first else "Resoluciones"
    if "INSTITUTO DE VIVIENDA" in first:
        return "Resoluciones"
    if "MUNICIPALIDAD" in first or "MUNICIPIO" in first:
        return "Decretos Municipales"
    return "Resoluciones"  # fallback inside this section


def _subclasificar_municipios(lines):
    """Pick the category of a notice inside 'MUNICIPIOS - COMISIONES MUNICIPALES'.

    The first line is checked first; if it is inconclusive the whole text of
    the notice is searched.
    """
    first = _normalizar(lines[0])
    if first.startswith("ORDENANZA"):
        return "Ordenanzas"
    if first.startswith("DECRETO"):
        return "Decretos Municipales"
    if first.startswith("RESOLUCION"):
        return "Resoluciones Municipales"

    cuerpo = _normalizar(" ".join(lines))
    if "ORDENANZA" in cuerpo:
        return "Ordenanzas"
    if "RESOLUCION" in cuerpo:
        return "Resoluciones Municipales"
    return "Decretos Municipales"  # fallback


def extraer_avisos(docx_path, titulo_boletin, anio, numero, bloque_pdf):
    """Split the bulletin into notices and return one CSV row (dict) per notice.

    Parameters:
        docx_path: path to the .docx bulletin.
        titulo_boletin: value used as post_title for every row.
        anio: year; used as tag for categories in CATS_CON_ETIQUETA.
        numero: bulletin number (not used here; kept for the callers).
        bloque_pdf: HTML snippet prepended to every post (may be "").

    Returns a list of dicts with the keys post_title, post_content,
    post_status, post_type, tax_category and tax_post_tag.

    Requires python-docx (and lxml, which it depends on).
    """
    from docx import Document

    # ── 1. Load the document ────────────────────────────────────────────
    # doc.paragraphs / doc.tables are rebuilt on every access, so they are
    # read once and reused.
    doc = Document(docx_path)
    paragraphs = doc.paragraphs
    tables = doc.tables
    textos = [p.text.strip() for p in paragraphs]

    # ── 2. Walk the body once ───────────────────────────────────────────
    # python-docx exposes paragraphs and tables in the same order as the
    # <w:p> / <w:tbl> children of the body, so counting them here yields the
    # matching indices. Other nodes (w:sectPr, ...) are ignored.
    cat_cambios = {}        # para_idx -> category announced by a text box
    tablas_tras_para = {}   # para_idx -> [table_idx, ...] that follow it
    para_idx = -1           # index of the most recent body paragraph
    tabla_idx = 0
    for child in _leer_body(docx_path):
        tag = _tag_local(child)
        if tag == "p":
            para_idx += 1
            cat = _categoria_en_txbx(child)
            if cat:
                cat_cambios[para_idx] = cat
        elif tag == "tbl":
            if para_idx >= 0:  # a table before any paragraph belongs to no block
                tablas_tras_para.setdefault(para_idx, []).append(tabla_idx)
            tabla_idx += 1

    # ── 3. Active category per paragraph ────────────────────────────────
    # A category becomes active at the paragraph its box is anchored to, and
    # each block takes the category of its LAST paragraph.
    # NOTE(review): the original comments said the category should only take
    # effect after the next separator; the code never did that, and the
    # behaviour is left unchanged here. Confirm which one is intended.
    cat_por_para = []
    current_cat = "Sin clasificar"
    for i in range(len(paragraphs)):
        current_cat = cat_cambios.get(i, current_cat)
        cat_por_para.append(current_cat)

    # ── 4. List formats (numId -> 'ol' / 'ul') ──────────────────────────
    num_format_map = _leer_formatos_lista(docx_path)

    # ── 5. Split into blocks at paragraphs that carry a border ──────────
    # A non-empty paragraph whose XML contains pBdr closes a notice.
    limites = [
        i + 1
        for i, para in enumerate(paragraphs)
        if textos[i] and "pBdr" in para._element.xml
    ]
    if not limites or limites[-1] != len(paragraphs):
        limites.append(len(paragraphs))  # trailing block without separator

    blocks = []
    inicio = 0
    for fin in limites:
        indices = list(range(inicio, fin))
        lines = [textos[j] for j in indices if textos[j]]
        if lines:
            blocks.append((lines, indices))
        inicio = fin

    # ── 6. Classify each block and build its row ────────────────────────
    prefijos_leyes = ("LEY", "DECRETO", "ACUERDO", "RESOLUCION")
    avisos = []
    for lines, indices in blocks:
        category = cat_por_para[indices[-1]]
        first = _normalizar(lines[0])

        # Skip excluded sections (e.g. FE DE ERRATAS).
        if category == "_IGNORAR_":
            continue

        # Sub-classify the mixed sections.
        parece_ley = first.startswith(prefijos_leyes) or "INSTITUTO DE VIVIENDA" in first
        if category == "_LEYES_DECRETOS_RESOLUCIONES_" or (
                category == "Sin clasificar" and parece_ley):
            category = _subclasificar_leyes_decretos(lines)
        elif category == "_MUNICIPIOS_":
            category = _subclasificar_municipios(lines)

        # Auctioneer registrations inside Notificaciones get two categories.
        if category == "Notificaciones" and first.startswith("INSCRIPCION DE MARTILLERO"):
            category = "Notificaciones, Inscripciones de Martilleros"

        etiqueta = anio if category.split(",")[0].strip() in CATS_CON_ETIQUETA else ""

        html_body = _build_block_html(
            indices, paragraphs, tables, tablas_tras_para, num_format_map)
        html_content = "\n" + bloque_pdf + "\n" + html_body + "\n"

        avisos.append({
            "post_title": titulo_boletin,
            "post_content": html_content,
            "post_status": "publish",
            "post_type": "post",
            "tax_category": category,
            "tax_post_tag": etiqueta,
        })

    return avisos


def _asegurar_python_docx():
    """Make sure python-docx can be imported, installing it with pip if needed."""
    try:
        import docx  # noqa: F401
        return
    except ImportError:
        pass

    print("Instalando python-docx...")
    resultado = subprocess.run(
        [sys.executable, "-m", "pip", "install", "python-docx", "-q"])
    if resultado.returncode != 0:
        print("ERROR: No se pudo instalar python-docx. "
              "Instalalo manualmente con: pip install python-docx")
        sys.exit(1)


def _detectar_anio_en_documento(docx_path):
    """Look for a 20xx year in the text boxes of the first 10 body nodes.

    Returns the year as a string, or "" when nothing is found or the file
    cannot be inspected (best effort).
    """
    try:
        for child in _leer_body(docx_path)[:10]:
            for txbx in child.findall(f".//{{{WPS_NS}}}txbx"):
                m = _ANIO_RE.search(_texto_txbx(txbx, " "))
                if m:
                    return m.group(1)
    except Exception as exc:  # best effort: the user is asked instead
        print(f"  ⚠ No se pudo leer el año del documento ({exc})", file=sys.stderr)
    return ""


def _generar_csv(archivo, salida):
    """Ask for the bulletin title, extract the notices and write the CSV."""
    print("─" * 55)
    titulo = input("Título del boletín (ej: BOLETIN OFICIAL Nº 29 – 09/03/2026): ").strip()
    if not titulo:
        print("ERROR: El título no puede estar vacío.")
        sys.exit(1)

    # Bulletin number, taken from the title.
    match_num = _NUMERO_RE.search(titulo)
    numero = match_num.group(1) if match_num else ""

    # Year: first from the title, then from the document, then from the user.
    match_anio = _ANIO_RE.search(titulo)
    anio = match_anio.group(1) if match_anio else ""
    if not anio:
        anio = _detectar_anio_en_documento(archivo)
    if not anio:
        anio = input(" No se detectó el año automáticamente. "
                     "Ingresalo manualmente (ej: 2026): ").strip()

    if anio:
        print(f"  → Año detectado: {anio}")
    if numero:
        print(f"  → Número de boletín: {numero}")
        if anio:  # the link is meaningless without the year
            pdf_url = PDF_BASE_URL.format(anio=anio, numero=numero)
            print(f"  → Link PDF: {pdf_url}")
    else:
        print("  ⚠ No se pudo detectar el número de boletín — el ícono PDF no se agregará")
    print("─" * 55)
    print()

    bloque_pdf = armar_bloque_pdf(numero, anio) if numero and anio else ""

    print("Extrayendo avisos...")
    avisos = extraer_avisos(archivo, titulo, anio, numero, bloque_pdf)
    print(f"  → {len(avisos)} avisos encontrados")

    resumen = Counter(a["tax_category"] for a in avisos)
    print("\nResumen por categoría:")
    for cat, n in sorted(resumen.items()):
        print(f"  {n:3d}  {cat}")

    campos = ["post_title", "post_content", "post_status", "post_type",
              "tax_category", "tax_post_tag"]
    # utf-8-sig writes a BOM at the start of the file.
    with open(salida, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=campos)
        writer.writeheader()
        writer.writerows(avisos)

    print(f"\n✓ CSV generado: {salida}")
    print(f"  {len(avisos)} entradas listas para importar en WordPress")
    print()
    print("Próximo paso: Herramientas → Import CSV en WordPress")


def main():
    """Command-line entry point: boletin_to_csv.py <archivo.doc|archivo.docx>."""
    _asegurar_python_docx()

    if len(sys.argv) < 2:
        print("Uso: python boletin_to_csv.py archivo.doc")
        print("     python boletin_to_csv.py archivo.docx")
        sys.exit(1)

    archivo = sys.argv[1]
    if not archivo.lower().endswith((".doc", ".docx")):
        print("ERROR: El archivo debe ser .doc o .docx")
        sys.exit(1)
    if not os.path.isfile(archivo):
        print(f"ERROR: No se encontró el archivo: {archivo}")
        sys.exit(1)

    convertido = None  # temporary .docx produced by LibreOffice, if any
    if archivo.lower().endswith(".doc"):
        print(f"Convirtiendo {archivo} a .docx con LibreOffice...")
        archivo = convertir_doc_a_docx(archivo)
        convertido = archivo
        print("  → Listo\n")

    salida = Path(sys.argv[1]).stem + "_avisos.csv"
    try:
        _generar_csv(archivo, salida)
    finally:
        # Do not leave the converted copy behind in the temp directory.
        if convertido:
            try:
                os.remove(convertido)
            except OSError:
                pass


if __name__ == "__main__":
    main()