fabulous-fedlex/converter.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json

from bs4 import BeautifulSoup


def get_eid(elmt):
    """Return the last "/"-separated segment of the element's ``eId`` attribute."""
    full_id = elmt["eId"]
    # rpartition leaves the whole string in the tail when there is no "/".
    _, _, tail = full_id.rpartition("/")
    return tail


def register_path(elmts, elmt):
    """Record the ancestors of ``elmt`` in the ``elmts`` tree and return their path.

    Ancestors are visited from the outermost one inwards. Everything up to
    and including the ``body`` ancestor is ignored; each ancestor below it is
    inserted (once) at ``node["children"][tag name][eId]`` with the metadata
    produced by ``get_meta``, and the walk then descends into that entry.

    Returns a list of ``(tag name, eId)`` tuples, outermost first. The list
    is empty when no ancestor is named ``body``.
    """
    node = elmts
    trail = []
    below_body = False

    for ancestor in reversed(elmt.find_parents()):
        tag = ancestor.name

        if not below_body:
            # Skip the wrappers around the body, and the body element itself.
            below_body = tag == "body"
            continue

        same_tag = node["children"].setdefault(tag, {})

        key = get_eid(ancestor)
        if key not in same_tag:
            same_tag[key] = get_meta(ancestor)

        trail.append((tag, key))
        node = same_tag[key]

    return trail


def get_meta(elmt):
    """Collect the metadata stored for a structural element.

    Returns a dict that always contains an empty ``"children"`` mapping.
    ``"num"`` is added (via ``get_text``) when ``elmt.find("num")`` finds an
    element, and ``"heading"`` is added with the first non-blank string of
    the element found by ``elmt.find("heading")``. A heading that contains
    no text is left out instead of raising ``StopIteration``.
    """
    meta = {
        "children": {}
    }

    num = elmt.find("num")
    if num is not None:
        meta["num"] = get_text(num)

    heading = elmt.find("heading")
    if heading is not None:
        # stripped_strings yields nothing for an empty heading; a bare next()
        # would then raise StopIteration and abort the whole conversion.
        first_string = next(heading.stripped_strings, None)
        if first_string is not None:
            meta["heading"] = first_string

    return meta


def get_text(elmt):
    """Return the joined text of the plain-text, ``b`` and ``i`` children.

    Only direct children whose ``name`` is ``None``, ``"b"`` or ``"i"`` are
    used. Each piece is stripped before joining (so no separator survives
    between pieces) and the joined result is stripped once more.
    """
    kept_names = (None, "b", "i")
    pieces = [
        node.get_text().strip()
        for node in elmt.children
        if node.name in kept_names
    ]
    return "".join(pieces).strip()


def parse_elmt(elmt):
    """Convert a parsed XML node into plain, JSON-serialisable data.

    The result depends on the node's ``name``:

    - ``None`` (text node): the node's stripped text;
    - ``sup`` / ``sub``: ``{"type": <name>, "body": [parsed children]}``;
    - ``blockList``: ``{"type": "enum", "intro": ..., "items": [...]}`` where
      ``intro`` is the parsed ``listIntroduction`` (``[]`` when absent) and
      each item is a ``(num, parsed item)`` pair built from the direct
      ``item`` children (``num`` is ``""`` when the item has none);
    - anything else: the parsed children, ``num`` children excluded, passed
      through ``remove_empty``.
    """
    name = elmt.name

    if name is None:
        return elmt.get_text().strip()

    if name in ("sup", "sub"):
        return {
            "type": name,
            "body": [parse_elmt(child) for child in elmt.children]
        }

    if name == "blockList":
        # find() returns None when there is no introduction; recursing on
        # None would fail on ``None.name``. ``[]`` is what remove_empty
        # yields for an introduction with no content.
        intro_elmt = elmt.find("listIntroduction")
        intro = parse_elmt(intro_elmt) if intro_elmt is not None else []

        items = []
        for item in elmt.find_all("item", recursive=False):
            num_elmt = item.find("num")
            # An unnumbered item gets an empty label instead of crashing.
            num = get_text(num_elmt) if num_elmt is not None else ""
            items.append((num, parse_elmt(item)))

        return {
            "type": "enum",
            "intro": intro,
            "items": items
        }

    # Generic container: the numbering is handled by callers, so skip it.
    body = [parse_elmt(child) for child in elmt.children if child.name != "num"]
    return remove_empty(body)


def remove_empty(body):
    """Drop blank strings from ``body`` and unwrap a single remaining entry.

    Strings that are empty or whitespace-only are removed; non-string
    entries are always kept. When exactly one entry remains it is returned
    on its own, otherwise the filtered list is returned.
    """
    kept = []
    for part in body:
        if isinstance(part, str) and not part.strip():
            continue
        kept.append(part)

    return kept[0] if len(kept) == 1 else kept


def convert(in_path, out_path):
    """Convert the XML file at ``in_path`` into a JSON file at ``out_path``.

    All ``transitional`` and ``proviso`` elements are removed first. Then,
    for every ``article`` element, each ``paragraph`` descendant becomes
    ``{"num": ..., "content": ...}`` (``num`` defaults to ``"1"`` when the
    paragraph has no ``num`` element; ``authorialNote`` elements are removed
    from the content before parsing). The article is appended to the list at
    ``elmts["children"]["article"][eId]`` as
    ``{"num": ..., "path": ..., "children": [paragraphs]}``, where ``path``
    comes from ``register_path`` (which also records the article's ancestors
    in the same tree). The tree is finally dumped with ``json.dump``.
    """
    # Read raw bytes so the parser resolves the document's encoding itself;
    # text mode would decode with the platform default, which is not
    # guaranteed to match the file.
    with open(in_path, "rb") as f:
        bs = BeautifulSoup(f.read(), "xml")

    elmts = {
        "children": {
            "article": {}
        }
    }

    # Drop the sections that must not appear in the output.
    for tag_name in ("transitional", "proviso"):
        for unwanted in bs.find_all(tag_name):
            unwanted.extract()

    articles = elmts["children"]["article"]

    for article in bs.find_all("article"):
        pars = []
        for par in article.find_all("paragraph"):
            num_elmt = par.find("num")
            num = "1" if num_elmt is None else get_text(num_elmt)

            # NOTE(review): a paragraph without a <content> child makes
            # find() return None and the next line raise AttributeError.
            text = par.find("content")
            for note in text.find_all("authorialNote"):
                note.extract()

            body = remove_empty([parse_elmt(child) for child in text.children])

            pars.append({
                "num": num,
                "content": body
            })

        path = register_path(elmts, article)
        eid = get_eid(article)
        articles.setdefault(eid, []).append({
            "num": get_text(article.num),
            "path": path,
            "children": pars
        })

    # json.dump escapes non-ASCII by default; the explicit encoding just
    # keeps the output independent of the platform default.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(elmts, f)


if __name__ == "__main__":
    # Each entry pairs an input XML file with the JSON file to write.
    paths = [("./raw/RS-311.0-01072024-FR.xml", "./law/code_penal.json")]

    for in_path, out_path in paths:
        # Announce each conversion before running it.
        print(f"{in_path} -> {out_path}")
        convert(in_path, out_path)