#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert a law published as XML into a nested JSON document.

The XML is parsed with BeautifulSoup's ``"xml"`` parser.  Every ``article``
element is flattened into a list of numbered paragraphs, and the chain of
containers that enclose it inside ``body`` is recorded in a tree so that each
article can be located through its ``path``.

NOTE(review): the element names used here (``eId``, ``blockList``,
``listIntroduction``, ``authorialNote`` ...) look like Akoma Ntoso markup --
confirm against the input files.
"""

import json

from bs4 import BeautifulSoup


def get_eid(elmt):
    """Return the last ``/``-separated segment of the element's ``eId``."""
    return elmt["eId"].split("/")[-1]


def _child_text(elmt, name, default=None):
    """Return the text of the *direct* child ``name``, or ``default``.

    Only direct children are considered so that a ``num`` belonging to a
    nested element (for example a list item inside a paragraph) is never
    mistaken for the element's own ``num``.
    """
    child = elmt.find(name, recursive=False)
    if child is None:
        return default
    return get_text(child)


def register_path(elmts, elmt):
    """Record the containers enclosing ``elmt`` in the ``elmts`` tree.

    The ancestors of ``elmt`` are walked from the outermost to the innermost.
    Everything up to and including the ``body`` element is skipped; each
    remaining ancestor is inserted (once) under
    ``node["children"][name][eid]`` with the metadata from :func:`get_meta`.

    Args:
        elmts: Root of the tree, a dict with a ``"children"`` dict.  It is
            modified in place.
        elmt: The element whose ancestors are registered.

    Returns:
        The list of ``(name, eid)`` tuples leading from ``body`` down to the
        direct parent of ``elmt``.
    """
    node = elmts
    path = []
    in_body = False
    # find_parents() yields the closest ancestor first; walk outermost-first.
    for parent in reversed(elmt.find_parents()):
        name = parent.name
        if not in_body:
            # Skip ancestors outside of <body>, and <body> itself.
            if name == "body":
                in_body = True
            continue
        eid = get_eid(parent)
        siblings = node["children"].setdefault(name, {})
        if eid not in siblings:
            siblings[eid] = get_meta(parent)
        path.append((name, eid))
        node = siblings[eid]
    return path


def get_meta(elmt):
    """Build the tree entry of a container element.

    Returns:
        A dict with an empty ``"children"`` dict, plus ``"num"`` and
        ``"heading"`` when the element has such direct children.  The heading
        is the first non-blank string found inside the ``heading`` element;
        an empty heading is left out instead of raising ``StopIteration``.
    """
    meta = {"children": {}}
    num = _child_text(elmt, "num")
    if num is not None:
        meta["num"] = num
    # Direct child only: otherwise a container without a heading would
    # inherit the heading of the first nested element that has one.
    heading = elmt.find("heading", recursive=False)
    if heading is not None:
        first_string = next(heading.stripped_strings, None)
        if first_string is not None:
            meta["heading"] = first_string
    return meta


def get_text(elmt):
    """Return the concatenated text of the element's plain children.

    Only direct children that are bare strings, ``<b>`` or ``<i>`` are kept;
    each piece is stripped before joining and the result is stripped again.
    """
    pieces = (
        child.get_text().strip()
        for child in elmt.children
        if child.name in (None, "b", "i")
    )
    return "".join(pieces).strip()


def parse_elmt(elmt):
    """Recursively convert a node into JSON-serialisable data.

    * A string node becomes its stripped text.
    * ``sup`` / ``sub`` become ``{"type": <name>, "body": [...]}``.
    * ``blockList`` becomes ``{"type": "enum", "intro": ..., "items": [...]}``
      where every item is a ``(num, body)`` pair.  A list without a
      ``listIntroduction`` gets ``[]`` as intro (the same value an empty
      introduction produces), and an item without ``num`` gets ``""``.
    * Any other element becomes the parsed list of its children (``num``
      children excluded), simplified by :func:`remove_empty`.
    """
    name = elmt.name
    if name is None:
        return elmt.get_text().strip()

    if name in ("sup", "sub"):
        return {
            "type": name,
            "body": [parse_elmt(child) for child in elmt.children],
        }

    if name == "blockList":
        # Look at direct children only, like the item lookup below, so that
        # the introduction of a nested list is never picked up by mistake.
        intro_elmt = elmt.find("listIntroduction", recursive=False)
        intro = [] if intro_elmt is None else parse_elmt(intro_elmt)
        items = [
            (_child_text(item, "num", ""), parse_elmt(item))
            for item in elmt.find_all("item", recursive=False)
        ]
        return {"type": "enum", "intro": intro, "items": items}

    return remove_empty(
        [parse_elmt(child) for child in elmt.children if child.name != "num"]
    )


def remove_empty(body):
    """Drop blank strings from ``body`` and unwrap single-element results.

    Returns:
        The only remaining element when exactly one is left, otherwise the
        filtered list (which may be empty).
    """
    kept = [part for part in body if not isinstance(part, str) or part.strip()]
    if len(kept) == 1:
        return kept[0]
    return kept


def convert(in_path, out_path):
    """Convert the XML file ``in_path`` into the JSON file ``out_path``.

    ``transitional`` and ``proviso`` elements are removed from the document
    first, as are the ``authorialNote`` elements inside paragraph contents.
    The resulting JSON object has the shape::

        {"children": {"article": {<eid>: [<article>, ...]}, <containers>...}}

    where each article is ``{"num": ..., "path": ..., "children": [...]}``
    and each child is ``{"num": ..., "content": ...}``.

    Args:
        in_path: Path of the XML file to read (decoded as UTF-8).
        out_path: Path of the JSON file to write.
    """
    # Decode explicitly: the platform default encoding is not reliable.
    with open(in_path, "r", encoding="utf-8") as f:
        bs = BeautifulSoup(f.read(), "xml")

    elmts = {"children": {"article": {}}}
    articles = elmts["children"]["article"]

    for tag_name in ("transitional", "proviso"):
        for unwanted in bs.find_all(tag_name):
            unwanted.extract()

    for article in bs.find_all("article"):
        pars = []
        for par in article.find_all("paragraph"):
            # An unnumbered paragraph counts as paragraph "1".  The lookup is
            # restricted to direct children so that the number of a list item
            # inside the paragraph is not used instead.
            num = _child_text(par, "num", "1")

            content = par.find("content", recursive=False)
            if content is None:
                # Paragraph without a <content> child: keep it, with no body.
                parsed = []
            else:
                for note in content.find_all("authorialNote"):
                    note.extract()
                parsed = [parse_elmt(child) for child in content.children]

            pars.append({"num": num, "content": remove_empty(parsed)})

        path = register_path(elmts, article)
        articles.setdefault(get_eid(article), []).append({
            "num": _child_text(article, "num", ""),
            "path": path,
            "children": pars,
        })

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(elmts, f)


def main():
    """Convert every configured ``(input, output)`` pair."""
    paths = [
        ("./raw/RS-311.0-01072024-FR.xml", "./law/code_penal.json"),
    ]
    for in_path, out_path in paths:
        print(f"{in_path} -> {out_path}")
        convert(in_path, out_path)


if __name__ == "__main__":
    main()