#!/usr/bin/env python3
"""Official Bulletin (Boletín Oficial) → CSV for WordPress import.

Extracts notices ("avisos") from a .doc or .docx file and writes a CSV that
can be imported with the "Really Simple CSV Importer" plugin.

Usage:
    python boletin_to_csv.py BO29.doc
    python boletin_to_csv.py BO29.docx
"""

import csv
import html
import os
import re
import shutil
import subprocess
import sys
import tempfile
import unicodedata
import zipfile
from collections import Counter
from pathlib import Path

# Categories whose posts are tagged with the bulletin year.
CATS_CON_ETIQUETA = {
    "Leyes",
    "Acuerdo Legislativo",
    "Decretos",
    "Resoluciones",
    "Decretos Municipales",
    "Ordenanzas",
    "Resoluciones Municipales",
    "Acordada Municipal",
    "Partidos Políticos",
}

# Categories that additionally receive a municipality tag.
CATS_MUNICIPALES = (
    "Acordada Municipal",
    "Decretos Municipales",
    "Ordenanzas",
    "Resoluciones Municipales",
)

PDF_IMG_URL = "https://boletinoficial.jujuy.gob.ar/wp-content/uploads/2016/10/PDFDownload.png"
PDF_BASE_URL = "https://boletinoficial.jujuy.gob.ar/wp-content/uploads/2016/Boletines/{anio}/{numero}-{anio}.pdf"

# XML namespaces used inside word/document.xml and word/numbering.xml.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
WPS_NS = "http://schemas.microsoft.com/office/word/2010/wordprocessingShape"

# Section headers (text-box content, upper-cased) mapped straight to a
# WordPress category. Accented and unaccented spellings are both listed.
MAPA_CATEGORIAS = {
    "PARTIDOS POLITICOS": "Partidos Políticos",
    "PARTIDOS POLÍTICOS": "Partidos Políticos",
    "LICITACIONES - CONCURSO DE PRECIOS": "Licitaciones",
    "CONTRATOS - CONVOCATORIAS - ACTAS": "Contratos",
    "REMATES": "Remates",
    "CONCURSOS Y QUIEBRAS": "Concursos y Quiebras",
    "EDICTOS DE MINAS": "Minas",
    "EDICTOS DE USUCAPION": "Usucapión",
    "EDICTOS DE USUCAPIÓN": "Usucapión",
    "EDICTOS DE NOTIFICACION": "Notificaciones",
    "EDICTOS DE NOTIFICACIÓN": "Notificaciones",
    "EDICTOS DE CITACION": "Citación",
    "EDICTOS DE CITACIÓN": "Citación",
    "EDICTOS SUCESORIOS": "Sucesorios",
}

MUNICIPIOS = [
    "ABRA PAMPA", "ABRALAITE", "AGUAS CALIENTES", "ARRAYANAL", "BARRACAS",
    "BARRANCAS", "CAIMANCITO", "CALILEGUA", "CANGREJILLOS", "CASPALA",
    "CATUA", "CIENEGUILLAS", "COCHINOCA", "CORANZULI", "EL AGUILAR",
    "EL CARMEN", "EL CONDOR", "EL MORENO", "EL PIQUETE", "EL TALAR",
    "FRAILE PINTADO", "HIPOLITO YRIGOYEN", "HUACALERA", "HUMAHUACA",
    "LA ESPERANZA", "LA MENDIETA", "LA QUIACA", "MINA PIRQUITAS",
    "MONTERRICO", "PALMA SOLA", "PALPALA", "PAMPA BLANCA", "PAMPICHUELA",
    "PERICO", "PUESTO DEL MARQUEZ", "PUESTO VIEJO", "PURMAMARCA",
    "RINCONADA", "RODEITO", "ROSARIO DE RIO GRANDE", "SAN ANTONIO",
    "SAN FRANCISCO", "SAN PEDRO", "SANTA ANA", "SANTA CLARA", "SUSQUES",
    "TILCARA", "TRES CRUCES", "TUMBAYA", "UQUIA", "VINALITO", "VOLCAN",
    "YALA", "YAVI", "YUTO",
]

# Municipalities whose tag differs from the name found in the text.
MUNICIPIO_ETIQUETA_ESPECIAL = {
    "SAN SALVADOR DE JUJUY": "S. S. DE JUJUY",
    "LIBERTADOR GENERAL SAN MARTIN": "LGSM",
    "LIBERTADOR GRAL SAN MARTIN": "LGSM",
    "LIBERTADOR GRAL. SAN MARTIN": "LGSM",
    "SAN FRANCISCO DE TILCARA": "TILCARA",
}

# First-line prefixes that send an unclassified block to the
# laws/decrees/resolutions sub-classifier.
_PREFIJOS_NORMATIVOS = ("LEY", "DECRETO", "ACUERDO", "RESOLUCION", "RESOLUCIÓN")

_ANIO_RE = re.compile(r'\b(20\d{2})\b')
_NUMERO_RE = re.compile(r'N[ºo°]?\s*(\d+)', re.IGNORECASE)
# "EC" / "E.C." must not be glued to a letter on either side, otherwise
# ordinary words such as "ESPECIAL" would be taken for the edition marker.
# Digits are allowed on the left so that "29EC" is still recognised.
_EC_RE = re.compile(r'(?<![^\W\d])E\.?C\.?(?![^\W\d])', re.IGNORECASE)
_ORGANISMO_RE = re.compile(
    r'(DECRETO|RESOLUCIÓN?|RESOLUCION)(\s+\w+)?\s+N[ºo°]?\s*\d+-([^/]+)/',
    re.IGNORECASE,
)
# NOTE(review): this pattern was lost in SOURCE (markup stripped); it is
# reconstructed from the surviving `_re.search(r'` fragment — confirm.
_NUM_ID_RE = re.compile(r'<w:numId w:val="(\d+)"')


def _sin_acentos(texto):
    """Return *texto* with combining marks (accents) removed."""
    descompuesto = unicodedata.normalize("NFD", texto)
    return "".join(c for c in descompuesto if not unicodedata.combining(c))


def _patron_palabra(nombre):
    """Compile a pattern matching *nombre* only as a whole word/phrase."""
    return re.compile(r"(?<!\w)" + re.escape(nombre) + r"(?!\w)")


# Special names are tried first, in declaration order; the general list is
# tried longest-first so a longer name wins over a shorter one it contains.
_PATRONES_ESPECIALES = [
    (_patron_palabra(nombre), etiqueta)
    for nombre, etiqueta in MUNICIPIO_ETIQUETA_ESPECIAL.items()
]
_PATRONES_MUNICIPIOS = [
    (_patron_palabra(nombre), nombre)
    for nombre in sorted(MUNICIPIOS, key=len, reverse=True)
]


def armar_bloque_pdf(numero, anio):
    """Return the HTML snippet with the PDF icon linking to the bulletin.

    NOTE(review): the tags inside these literals were stripped from SOURCE.
    The <p>/<a>/<img> structure is reconstructed from the five surviving
    string fragments; any original attributes (alignment, target, size)
    could not be recovered — confirm against a published post.
    """
    url = PDF_BASE_URL.format(anio=anio, numero=numero)
    return (
        f'<p>'
        f'<a href="{url}">'
        f'<img src="{PDF_IMG_URL}" alt="PDF">'
        f'</a>'
        f'</p>'
    )


def encontrar_libreoffice():
    """Return the path of the LibreOffice `soffice` executable, or None.

    The two standard Windows install locations are checked first; otherwise
    the executable is looked up on PATH.
    """
    rutas_windows = [
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
    ]
    for ruta in rutas_windows:
        if os.path.exists(ruta):
            return ruta
    return shutil.which("soffice")


def convertir_doc_a_docx(doc_path):
    """Convert *doc_path* to .docx with headless LibreOffice.

    Returns the path of the converted file, placed in the system temp
    directory. Prints an error and exits with status 1 when LibreOffice is
    not found, the conversion fails, or no output file is produced.
    """
    soffice = encontrar_libreoffice()
    if not soffice:
        print("ERROR: LibreOffice no encontrado.")
        print("Instalalo desde https://www.libreoffice.org y reiniciá la computadora.")
        print("O convertí el archivo a .docx desde Word manualmente.")
        sys.exit(1)

    tmpdir = tempfile.mkdtemp()
    try:
        result = subprocess.run(
            [soffice, "--headless", "--convert-to", "docx", "--outdir", tmpdir, doc_path],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            print(f"ERROR al convertir: {result.stderr}")
            sys.exit(1)
        out_name = Path(doc_path).stem + ".docx"
        convertido = os.path.join(tmpdir, out_name)
        # A zero exit status does not guarantee that the file was written.
        if not os.path.exists(convertido):
            print(f"ERROR al convertir: LibreOffice no generó {out_name}")
            sys.exit(1)
        final = os.path.join(tempfile.gettempdir(), out_name)
        shutil.copy(convertido, final)
        return final
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)


def es_encabezado_categoria(text):
    """Return the category marker for a bulletin section header, else None.

    Markers wrapped in underscores are internal: "_IGNORAR_" means the
    section is skipped, while "_LEYES_DECRETOS_RESOLUCIONES_" and
    "_MUNICIPIOS_" mean each notice is sub-classified individually. Any
    other value is the WordPress category name.
    """
    t = text.strip().upper()
    # Boxes to ignore.
    if t == "FE DE ERRATAS":
        return "_IGNORAR_"
    # Sections that are sub-classified notice by notice.
    if t == "LEYES - DECRETOS - RESOLUCIONES":
        return "_LEYES_DECRETOS_RESOLUCIONES_"
    if t == "MUNICIPIOS - COMISIONES MUNICIPALES":
        return "_MUNICIPIOS_"
    # Sections with a direct WordPress category.
    return MAPA_CATEGORIAS.get(t)


def detectar_municipio(lines):
    """Return the municipality tag found in the notice text, or "".

    Matching ignores case, accents and repeated whitespace, and only accepts
    whole words: the name lists are unaccented while documents usually are
    not ("PALPALÁ"), and plain substring search mis-tagged surnames such as
    "AYALA" as "YALA".
    """
    body = _sin_acentos(" ".join(" ".join(lines).upper().split()))
    for patron, etiqueta in _PATRONES_ESPECIALES:
        if patron.search(body):
            return etiqueta
    for patron, nombre in _PATRONES_MUNICIPIOS:
        if patron.search(body):
            return nombre
    return ""


def extraer_numero_boletin(titulo):
    """Return the bulletin number found in *titulo*, or "" if there is none.

    "EC" is appended when the title carries a standalone "EC" / "E.C."
    marker (complementary edition).
    """
    match_num = _NUMERO_RE.search(titulo)
    if not match_num:
        return ""
    numero = match_num.group(1)
    if _EC_RE.search(titulo):
        numero += "EC"
    return numero


def extraer_organismos(primera_linea):
    """Return the issuing-body codes embedded in a decree/resolution number.

    One hyphen, e.g. "1551-ISPTyV/2025" → ["ISPTyV"]. Several segments are
    only taken as bodies when the line says "CONJUNTA", e.g.
    "231-DEyP-E/2024" → ["DEyP", "E"]; otherwise they are ignored because
    they may be a case-file identifier rather than a list of bodies.
    """
    m = _ORGANISMO_RE.match(primera_linea.strip())
    if not m:
        return []
    partes = [p.strip() for p in m.group(3).split("-") if p.strip()]
    if "CONJUNTA" in primera_linea.upper() or len(partes) == 1:
        return partes
    return []


def subclasificar_leyes_decretos(lines):
    """Classify one notice of the "Leyes - Decretos - Resoluciones" section."""
    first = lines[0].upper()
    third = lines[2].upper() if len(lines) > 2 else ""
    # Laws and legislative agreements open with "LEGISLATURA DE JUJUY" and
    # carry their type on the third line.
    if "LEGISLATURA DE JUJUY" in first:
        if third.startswith("LEY"):
            return "Leyes"
        if third.startswith("ACUERDO"):
            return "Acuerdo Legislativo"
    if first.startswith("LEY"):
        return "Leyes"
    if first.startswith("ACUERDO"):
        return "Acuerdo Legislativo"
    if first.startswith("DECRETO"):
        return "Decretos Municipales" if "MUNICIPAL" in first else "Decretos"
    if first.startswith(("RESOLUCION", "RESOLUCIÓN")):
        return "Resoluciones Municipales" if "MUNICIPAL" in first else "Resoluciones"
    if "INSTITUTO DE VIVIENDA" in first:
        return "Resoluciones"
    if "MUNICIPALIDAD" in first or "MUNICIPIO" in first:
        return "Decretos Municipales"
    return "Resoluciones"  # fallback within this section


def subclasificar_municipios(lines):
    """Classify one notice of the "Municipios - Comisiones Municipales" section."""
    first = lines[0].upper()
    second = lines[1].upper() if len(lines) > 1 else ""
    body = " ".join(lines).upper()

    # Municipal agreement: detected by a key phrase anywhere in the notice.
    if "SANCIONA CON FUERZA DE ACUERDO" in body:
        return "Acordada Municipal"

    # The second line is checked before the first.
    if second.startswith("DECRETO"):
        return "Decretos Municipales"
    if second.startswith("ORDENANZA"):
        return "Ordenanzas"
    if second.startswith(("RESOLUCION", "RESOLUCIÓN")):
        return "Resoluciones Municipales"
    if second.startswith("ACUERDO"):
        return "Acordada Municipal"

    if first.startswith("ORDENANZA"):
        return "Ordenanzas"
    if first.startswith("DECRETO"):
        return "Decretos Municipales"
    if first.startswith(("RESOLUCION", "RESOLUCIÓN")):
        return "Resoluciones Municipales"

    # Content-based fallbacks when neither header line decided.
    if "EL INTENDENTE" in body and "DECRETA" in body:
        return "Decretos Municipales"
    if "SANCIONA LA ORDENANZA" in body or ("PROMULGASE" in body and "ORDENANZA" in body):
        return "Ordenanzas"
    if "RESOLUCION" in body or "RESOLUCIÓN" in body:
        return "Resoluciones Municipales"
    return "Decretos Municipales"  # fallback


def _textos_de_recuadros(elemento):
    """Yield the joined text of every text box found under *elemento*."""
    for txbx in elemento.findall(f".//{{{WPS_NS}}}txbx"):
        textos = txbx.findall(f".//{{{W_NS}}}t")
        yield " ".join(t.text for t in textos if t.text)


def _leer_formatos_numeracion(docx_path, etree):
    """Return {numId: "ol" | "ul"} read from word/numbering.xml.

    A list is "ol" when its level-0 format is "decimal", otherwise "ul".
    Best effort: an unreadable numbering part yields an empty map (with a
    warning) instead of aborting the export.
    """
    formatos = {}
    try:
        with zipfile.ZipFile(docx_path) as z:
            if "word/numbering.xml" not in z.namelist():
                return formatos
            num_tree = etree.fromstring(z.read("word/numbering.xml"))
    except (OSError, KeyError, zipfile.BadZipFile, etree.XMLSyntaxError) as exc:
        print(f" ⚠ No se pudo leer numbering.xml ({exc}) — las listas se tratarán como viñetas")
        return formatos

    abstract_nums = {}
    for an in num_tree.findall(f"{{{W_NS}}}abstractNum"):
        lvls = {}
        for lvl in an.findall(f"{{{W_NS}}}lvl"):
            fmt = lvl.find(f"{{{W_NS}}}numFmt")
            if fmt is not None:
                lvls[lvl.get(f"{{{W_NS}}}ilvl")] = fmt.get(f"{{{W_NS}}}val")
        abstract_nums[an.get(f"{{{W_NS}}}abstractNumId")] = lvls

    for num in num_tree.findall(f"{{{W_NS}}}num"):
        ref = num.find(f"{{{W_NS}}}abstractNumId")
        if ref is None:
            continue
        lvls = abstract_nums.get(ref.get(f"{{{W_NS}}}val"), {})
        fmt = lvls.get("0", "bullet")
        formatos[num.get(f"{{{W_NS}}}numId")] = "ol" if fmt == "decimal" else "ul"
    return formatos


def _para_to_html(para, num_format_map):
    """Return (html_text, list_type) for a paragraph, or (None, None) if empty.

    Bold and underline are rendered per run. list_type is "ol"/"ul" for a
    numbered paragraph and None otherwise.

    NOTE(review): the formatting tags and everything after the numId lookup
    were stripped from SOURCE. <strong>/<u> and the "ul" default for an
    unknown numId are reconstructions — confirm against the production
    output.
    """
    if not para.text.strip():
        return None, None

    fragmentos = []
    for run in para.runs:
        t = run.text
        if not t:
            continue
        # Document text is plain text: escape it before wrapping it in tags.
        t = html.escape(t, quote=False)
        if run.bold and run.underline:
            t = f"<strong><u>{t}</u></strong>"
        elif run.bold:
            t = f"<strong>{t}</strong>"
        elif run.underline:
            t = f"<u>{t}</u>"
        fragmentos.append(t)

    html_text = "".join(fragmentos).strip()
    if not html_text:
        return None, None

    xml = para._element.xml
    if "numPr" in xml:
        m = _NUM_ID_RE.search(xml)
        num_id = m.group(1) if m else None
        return html_text, num_format_map.get(num_id, "ul")
    return html_text, None


def _tabla_to_html(tabla):
    """Render a python-docx table as an HTML table.

    NOTE(review): only `{cell_text}` and the row/table joins survived in
    SOURCE. The <table>/<tr>/<td> tags, any table attributes and the way
    cell_text was derived are reconstructions — confirm.
    """
    rows_html = []
    for row in tabla.rows:
        cells = []
        for cell in row.cells:
            cell_text = html.escape(cell.text.strip(), quote=False)
            cells.append(f"<td>{cell_text}</td>")
        rows_html.append("<tr>" + "".join(cells) + "</tr>")
    return '<table>' + "".join(rows_html) + "</table>"


def _build_block_html(para_indices, paragraphs, tablas, body_sequence, seq_pos, num_format_map):
    """Build the HTML of one notice, including lists and tables.

    A table belongs to the block when the paragraph preceding it does, so
    the walk starts at the block's first paragraph and stops at the first
    paragraph past its last one.

    NOTE(review): the <li>, <p> and list-closing tags were stripped from
    SOURCE and are reconstructed here — confirm.
    """
    last_pi = para_indices[-1]
    html_parts = []
    lista_abierta = None  # None, "ul" or "ol"

    def cerrar_lista():
        nonlocal lista_abierta
        if lista_abierta:
            html_parts.append(f"</{lista_abierta}>")
            lista_abierta = None

    for pos in range(seq_pos[para_indices[0]], len(body_sequence)):
        _, tipo, ref = body_sequence[pos]
        if tipo == "tbl":
            cerrar_lista()
            html_parts.append(_tabla_to_html(tablas[ref]))
            continue
        if ref > last_pi:
            break
        text, list_type = _para_to_html(paragraphs[ref], num_format_map)
        if not text:
            continue
        if list_type:
            # Switching list type closes the open list and opens a new one.
            if lista_abierta != list_type:
                cerrar_lista()
                html_parts.append(f"<{list_type}>")
                lista_abierta = list_type
            html_parts.append(f"<li>{text}</li>")
        else:
            cerrar_lista()
            html_parts.append(f"<p>{text}</p>")

    cerrar_lista()
    return "\n".join(html_parts)


def _clasificar_bloque(category, lines):
    """Resolve the final WordPress category of a block."""
    first = lines[0].upper()
    if category == "_LEYES_DECRETOS_RESOLUCIONES_" or (
        category == "Sin clasificar"
        and (first.startswith(_PREFIJOS_NORMATIVOS) or "INSTITUTO DE VIVIENDA" in first)
    ):
        category = subclasificar_leyes_decretos(lines)
    elif category == "_MUNICIPIOS_":
        category = subclasificar_municipios(lines)

    # Auctioneer registrations inside Notifications get a double category.
    if category == "Notificaciones" and first.startswith("INSCRIPCIÓN DE MARTILLERO"):
        category = "Notificaciones, Inscripciones de Martilleros"
    return category


def _armar_etiquetas(category, lines, anio):
    """Return the comma-separated tag string (year, municipality, bodies)."""
    etiquetas = []
    if anio and category.split(",")[0].strip() in CATS_CON_ETIQUETA:
        etiquetas.append(anio)
    if category in CATS_MUNICIPALES:
        municipio = detectar_municipio(lines)
        if municipio:
            etiquetas.append(municipio)
    if category in ("Decretos", "Resoluciones"):
        etiquetas.extend(extraer_organismos(lines[0]))
    return ", ".join(etiquetas)


def extraer_avisos(docx_path, titulo_boletin, anio, numero, bloque_pdf):
    """Extract the notices of a bulletin .docx as a list of CSV row dicts.

    Args:
        docx_path: path of the .docx file.
        titulo_boletin: title used as post_title of every notice.
        anio: bulletin year, used as a tag ("" to omit it).
        numero: bulletin number (accepted for interface compatibility; not
            used here).
        bloque_pdf: HTML snippet placed before each notice body ("" for none).

    Returns:
        One dict per notice with the keys post_title, post_content,
        post_status, post_type, tax_category and tax_post_tag.

    Requires python-docx and lxml (imported lazily).
    """
    from docx import Document
    from lxml import etree

    # 1. Read the body XML: section headers live in text boxes anchored to
    #    body children, which python-docx does not expose.
    with zipfile.ZipFile(docx_path) as z:
        tree = etree.fromstring(z.read("word/document.xml"))
    body_children = list(tree.find(f"{{{W_NS}}}body"))

    cat_por_child = {}  # body child index → category marker
    for idx, child in enumerate(body_children):
        for content in _textos_de_recuadros(child):
            cat = es_encabezado_categoria(content)
            if cat:
                cat_por_child[idx] = cat

    # 2. Walk the body once, numbering paragraphs and tables in document
    #    order; the counters are used as indices into doc.paragraphs and
    #    doc.tables.
    body_sequence = []   # (body_idx, "p" | "tbl", index within its kind)
    para_index_map = {}  # body_idx → paragraph index
    seq_pos = {}         # paragraph index → position in body_sequence
    n_paras = n_tablas = 0
    for body_idx, child in enumerate(body_children):
        if not isinstance(child.tag, str):
            continue  # comments / processing instructions have no tag name
        tag = child.tag.split("}")[-1]
        if tag == "p":
            para_index_map[body_idx] = n_paras
            seq_pos[n_paras] = len(body_sequence)
            body_sequence.append((body_idx, "p", n_paras))
            n_paras += 1
        elif tag == "tbl":
            body_sequence.append((body_idx, "tbl", n_tablas))
            n_tablas += 1

    doc = Document(docx_path)
    # doc.paragraphs / doc.tables build a new list on every access, so they
    # are read once here instead of inside the loops below.
    paragraphs = doc.paragraphs
    tablas = doc.tables
    textos = [p.text.strip() for p in paragraphs]

    # 3. A category becomes active at the paragraph holding its text box and
    #    stays active until the next one.
    cat_cambios = {
        para_index_map[body_idx]: cat
        for body_idx, cat in cat_por_child.items()
        if body_idx in para_index_map
    }
    cat_por_para = []
    current_cat = "Sin clasificar"
    for para_idx in range(len(paragraphs)):
        current_cat = cat_cambios.get(para_idx, current_cat)
        cat_por_para.append(current_cat)

    # 4. numId → list type, for rendering numbered paragraphs.
    num_format_map = _leer_formatos_numeracion(docx_path, etree)

    # 5. Notices are delimited by paragraphs carrying a border (pBdr).
    sep_indices = [i for i, p in enumerate(paragraphs) if "pBdr" in p._element.xml]

    blocks = []  # (non-empty text lines, paragraph indices)
    prev = 0
    for sep_i in sep_indices:
        indices = list(range(prev, sep_i + 1))
        lines = [textos[j] for j in indices[:-1] if textos[j]]
        sep_text = textos[sep_i]
        if sep_text:
            lines.append(sep_text)
        # Keep the block only if it has content besides the separator's own
        # text; this drops phantom blocks between consecutive separators.
        if lines and (len(lines) > 1 or not sep_text):
            blocks.append((lines, indices))
        prev = sep_i + 1

    last_indices = list(range(prev, len(paragraphs)))
    last_lines = [textos[j] for j in last_indices if textos[j]]
    if last_lines:
        blocks.append((last_lines, last_indices))

    # 6. Classify, tag and render each block.
    avisos = []
    for lines, indices in blocks:
        # The block takes the category active at its last paragraph.
        category = cat_por_para[indices[-1]]
        if category == "_IGNORAR_":
            continue
        category = _clasificar_bloque(category, lines)
        etiqueta = _armar_etiquetas(category, lines, anio)
        html_body = _build_block_html(
            indices, paragraphs, tablas, body_sequence, seq_pos, num_format_map
        )
        avisos.append({
            "post_title": titulo_boletin,
            "post_content": "\n" + bloque_pdf + "\n" + html_body + "\n",
            "post_status": "publish",
            "post_type": "post",
            "tax_category": category,
            "tax_post_tag": etiqueta,
        })
    return avisos


def _detectar_anio_en_documento(docx_path):
    """Return the first 20xx year found in the text boxes of the first ten
    body elements of the document, or "" when none can be read."""
    try:
        from lxml import etree
    except ImportError:
        return ""
    try:
        with zipfile.ZipFile(docx_path) as z:
            tree = etree.fromstring(z.read("word/document.xml"))
    except (OSError, KeyError, zipfile.BadZipFile, etree.XMLSyntaxError):
        return ""
    body = tree.find(f"{{{W_NS}}}body")
    if body is None:
        return ""
    for child in list(body)[:10]:
        for txt in _textos_de_recuadros(child):
            m = _ANIO_RE.search(txt)
            if m:
                return m.group(1)
    return ""


def _escribir_csv(ruta, campos, filas):
    """Write *filas* to *ruta* as UTF-8 (with BOM) CSV with a header row."""
    with open(ruta, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=campos)
        writer.writeheader()
        writer.writerows(filas)


def main():
    """Command-line entry point: convert, extract, and write the CSV file(s)."""
    try:
        import docx  # noqa: F401  (availability check only)
    except ImportError:
        print("Instalando python-docx...")
        subprocess.run([sys.executable, "-m", "pip", "install", "python-docx", "-q"])

    if len(sys.argv) < 2:
        print("Uso: python boletin_to_csv.py archivo.doc")
        print(" python boletin_to_csv.py archivo.docx")
        sys.exit(1)

    archivo = sys.argv[1]
    if archivo.lower().endswith(".doc"):
        print(f"Convirtiendo {archivo} a .docx con LibreOffice...")
        archivo = convertir_doc_a_docx(archivo)
        print(f" → Listo\n")
    elif not archivo.lower().endswith(".docx"):
        print("ERROR: El archivo debe ser .doc o .docx")
        sys.exit(1)

    print("─" * 55)
    titulo = input("Título del boletín (ej: BOLETIN OFICIAL Nº 29 – 09/03/2026): ").strip()
    if not titulo:
        print("ERROR: El título no puede estar vacío.")
        sys.exit(1)

    numero = extraer_numero_boletin(titulo)

    # Year: from the title first, then the document header, then the user.
    match_anio = _ANIO_RE.search(titulo)
    anio = match_anio.group(1) if match_anio else ""
    if not anio:
        anio = _detectar_anio_en_documento(archivo)
    if not anio:
        anio = input(" No se detectó el año automáticamente. Ingresalo manualmente (ej: 2026): ").strip()

    if anio:
        print(f" → Año detectado: {anio}")
    if numero:
        print(f" → Número de boletín: {numero}")
        pdf_url = PDF_BASE_URL.format(anio=anio, numero=numero)
        print(f" → Link PDF: {pdf_url}")
    else:
        print(" ⚠ No se pudo detectar el número de boletín — el ícono PDF no se agregará")
    print("─" * 55)
    print()

    bloque_pdf = armar_bloque_pdf(numero, anio) if numero and anio else ""
    stem = Path(sys.argv[1]).stem
    salida = stem + "_avisos.csv"

    print("Extrayendo avisos...")
    avisos = extraer_avisos(archivo, titulo, anio, numero, bloque_pdf)
    print(f" → {len(avisos)} avisos encontrados")

    resumen = Counter(a["tax_category"] for a in avisos)
    print("\nResumen por categoría:")
    for cat, n in sorted(resumen.items()):
        print(f" {n:3d} {cat}")

    campos = ["post_title", "post_content", "post_status", "post_type",
              "tax_category", "tax_post_tag"]

    # Optionally split the output into batches.
    print()
    resp = input("¿Dividir el CSV en lotes para importar? (s/n): ").strip().lower()
    if resp == "s":
        try:
            tam = int(input("¿Cuántas entradas por lote? (ej: 20): ").strip())
        except ValueError:
            tam = 20
        if tam <= 0:
            # range() rejects a zero step and a negative one yields nothing.
            tam = 20
        lotes = [avisos[i:i + tam] for i in range(0, len(avisos), tam)]
        for idx, lote in enumerate(lotes, 1):
            nombre_lote = f"{stem}_parte{idx}_avisos.csv"
            _escribir_csv(nombre_lote, campos, lote)
            print(f" ✓ {nombre_lote} ({len(lote)} entradas)")
        print(f"\n✓ {len(lotes)} archivos generados — importalos de a uno en WordPress")
    else:
        _escribir_csv(salida, campos, avisos)
        print(f"\n✓ CSV generado: {salida}")
        print(f" {len(avisos)} entradas listas para importar en WordPress")
    print()
    print("Próximo paso: Herramientas → Import CSV en WordPress")


if __name__ == "__main__":
    main()