Files
fabulous-fedlex/converter.py
2026-01-26 11:32:09 +01:00

171 lines
4.0 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
from bs4 import BeautifulSoup
def get_eid(elmt):
    """Return the last "/"-separated segment of the element's ``eId`` attribute.

    When the attribute contains no "/", the whole value is returned.
    """
    full_id = elmt["eId"]
    # Only the final segment is needed, so a single split from the right is enough.
    return full_id.rsplit("/", 1)[-1]
def register_path(elmts, elmt):
    """Record the ancestors of ``elmt`` in the ``elmts`` tree and return its path.

    The ancestors of ``elmt`` are visited from outermost to innermost. Every
    ancestor up to and including the first one named ``body`` is ignored.
    Each later ancestor is stored, if not already present, under
    ``node["children"][tag name][eid]`` of the current tree node (the new node
    comes from ``get_meta``), and that entry becomes the current node.

    Args:
        elmts: Root tree node, a dict with a ``"children"`` dict. Mutated in place.
        elmt: Element offering ``find_parents()``.

    Returns:
        List of ``(tag name, eid)`` tuples, outermost ancestor first.
    """
    node = elmts
    trail = []
    seen_body = False
    # find_parents() is reversed so the walk starts at the outermost ancestor.
    for ancestor in reversed(elmt.find_parents()):
        tag = ancestor.name
        if not seen_body:
            # Skip everything outside the body, and the body element itself.
            seen_body = tag == "body"
            continue
        by_eid = node["children"].setdefault(tag, {})
        eid = get_eid(ancestor)
        if eid not in by_eid:
            by_eid[eid] = get_meta(ancestor)
        trail.append((tag, eid))
        node = by_eid[eid]
    return trail
def get_meta(elmt):
    """Build the tree node that describes a structural element.

    Args:
        elmt: Element offering ``find(tag_name)``.

    Returns:
        A dict that always has an empty ``"children"`` dict, plus:

        * ``"num"``: text of the first ``num`` found (via ``get_text``), if any;
        * ``"heading"``: first non-empty stripped string of the first
          ``heading`` found, if any. A heading with no text at all is
          treated like a missing heading and the key is omitted.
    """
    meta = {"children": {}}

    num = elmt.find("num")
    if num is not None:
        meta["num"] = get_text(num)

    title = elmt.find("heading")
    if title is not None:
        # A default avoids StopIteration when the heading holds no text.
        heading = next(title.stripped_strings, None)
        if heading is not None:
            meta["heading"] = heading

    return meta
def get_text(elmt):
    """Return the concatenated text of the element's plain, ``b`` and ``i`` children.

    Only direct children whose ``name`` is ``None`` (text nodes), ``"b"`` or
    ``"i"`` contribute; every other child is ignored. Each contribution is
    stripped before being joined without a separator, and the joined result
    is stripped once more.
    """
    kept_tags = (None, "b", "i")
    pieces = [
        child.get_text().strip()
        for child in elmt.children
        if child.name in kept_tags
    ]
    return "".join(pieces).strip()
def parse_elmt(elmt):
    """Recursively convert a parsed XML node into a JSON-serialisable value.

    The result depends on the node's ``name``:

    * ``None`` (text node): the stripped text.
    * ``"sup"`` / ``"sub"``: ``{"type": <name>, "body": [parsed children]}``.
    * ``"blockList"``: ``{"type": "enum", "intro": ..., "items": [...]}``,
      where ``intro`` is the parsed ``listIntroduction`` and each item is a
      ``(num, body)`` tuple built from the direct ``item`` children.
      A missing ``listIntroduction`` yields ``[]`` (the same value an empty
      one produces) and an item without ``num`` gets ``""`` as its number.
    * anything else: its parsed children, skipping ``num`` children, passed
      through ``remove_empty``.
    """
    name = elmt.name
    if name is None:
        return elmt.get_text().strip()

    if name in ("sup", "sub"):
        return {
            "type": name,
            "body": [parse_elmt(child) for child in elmt.children],
        }

    if name == "blockList":
        intro_elmt = elmt.find("listIntroduction")
        # Guard: find() returns None when the list has no introduction.
        intro = parse_elmt(intro_elmt) if intro_elmt is not None else []
        items = []
        for item in elmt.find_all("item", recursive=False):
            num_elmt = item.find("num")
            num = get_text(num_elmt) if num_elmt is not None else ""
            items.append((num, parse_elmt(item)))
        return {
            "type": "enum",
            "intro": intro,
            "items": items,
        }

    # Generic element: the number is reported separately by callers, so
    # ``num`` children are left out of the body.
    body = [parse_elmt(child) for child in elmt.children if child.name != "num"]
    return remove_empty(body)
def remove_empty(body):
    """Drop blank strings from ``body`` and unwrap a single remaining entry.

    Strings that are empty or whitespace-only are removed; non-string entries
    are always kept. If exactly one entry remains it is returned directly,
    otherwise the filtered list is returned.
    """
    kept = [
        part
        for part in body
        if not (isinstance(part, str) and not part.strip())
    ]
    return kept[0] if len(kept) == 1 else kept
def _parse_paragraph(par):
    """Convert one ``paragraph`` element into ``{"num": ..., "content": ...}``."""
    num_elmt = par.find("num")
    # Paragraphs without a number are labelled "1".
    num = "1" if num_elmt is None else get_text(num_elmt)

    text = par.find("content")
    if text is None:
        # No <content> child: report an empty body instead of crashing.
        body = []
    else:
        # Notes are detached first so they do not end up in the body.
        for note in text.find_all("authorialNote"):
            note.extract()
        body = remove_empty([parse_elmt(child) for child in text.children])

    return {"num": num, "content": body}


def convert(in_path, out_path):
    """Convert the XML document at ``in_path`` into a JSON file at ``out_path``.

    Every ``transitional`` and ``proviso`` element is removed from the parsed
    document. Then, for each ``article`` element, its paragraphs are parsed,
    its ancestors are registered in the output tree via ``register_path``, and
    an entry ``{"num", "path", "children"}`` is appended to the list stored at
    ``tree["children"]["article"][eid]``. The tree is finally written with
    ``json.dump``.

    Args:
        in_path: Path of the XML file to read.
        out_path: Path of the JSON file to write (overwritten if present).
    """
    # Read bytes so the parser decodes the document itself instead of relying
    # on the platform's default text encoding.
    with open(in_path, "rb") as f:
        bs = BeautifulSoup(f.read(), "xml")

    elmts = {
        "children": {
            "article": {},
        },
    }

    for tag in ("transitional", "proviso"):
        for node in bs.find_all(tag):
            node.extract()

    articles = elmts["children"]["article"]
    for article in bs.find_all("article"):
        pars = [_parse_paragraph(par) for par in article.find_all("paragraph")]
        path = register_path(elmts, article)
        # Several articles may share an eid, so entries are kept in a list.
        articles.setdefault(get_eid(article), []).append({
            "num": get_text(article.num),
            "path": path,
            "children": pars,
        })

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(elmts, f)
if __name__ == "__main__":
    # Each entry pairs an input XML file with the JSON file to produce.
    jobs = (
        ("./raw/RS-311.0-01072024-FR.xml", "./law/code_penal.json"),
    )
    for source, target in jobs:
        print(f"{source} -> {target}")
        convert(source, target)