chore: initial commit
This commit is contained in:
170
converter.py
Normal file
170
converter.py
Normal file
@@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def get_eid(elmt):
    """Return the last "/"-separated segment of the element's ``eId`` attribute.

    ``elmt`` must support ``elmt["eId"]`` and yield a string; a missing
    attribute raises ``KeyError``. When the value contains no "/", the whole
    value is returned.
    """
    full_id = elmt["eId"]
    _, _, last_segment = full_id.rpartition("/")
    return last_segment
|
||||
|
||||
|
||||
def register_path(elmts, elmt):
    """Record the ancestors of ``elmt`` in the nested ``elmts`` tree.

    The ancestors are walked from the outermost one inwards. Everything up to
    and including the first ancestor named ``body`` is ignored. Each ancestor
    after that is stored under ``node["children"][tag][eid]``, created with
    ``get_meta`` the first time it is seen, and becomes the node for the next
    level.

    Args:
        elmts: root dict with a ``"children"`` mapping; mutated in place.
        elmt: element exposing ``find_parents()``.

    Returns:
        The list of ``(tag name, eid)`` tuples for the registered ancestors,
        outermost first.
    """
    node = elmts
    trail = []
    seen_body = False

    # find_parents() yields the closest ancestor first; walk it outermost-first.
    for ancestor in reversed(elmt.find_parents()):
        tag = ancestor.name

        if not seen_body:
            # Skip everything until (and including) the <body> ancestor.
            seen_body = tag == "body"
            continue

        bucket = node["children"].setdefault(tag, {})
        key = get_eid(ancestor)
        if key not in bucket:
            bucket[key] = get_meta(ancestor)

        trail.append((tag, key))
        node = bucket[key]

    return trail
|
||||
|
||||
|
||||
def get_meta(elmt):
    """Build the metadata record stored for a structural element.

    Args:
        elmt: element exposing ``find(tag_name)``.

    Returns:
        A dict that always has an empty ``"children"`` mapping, plus:

        * ``"num"``: ``get_text`` of the first ``num`` element found, if any;
        * ``"heading"``: the first non-blank string of the first ``heading``
          element found, if any. A heading with no text is treated as absent
          rather than raising ``StopIteration``.
    """
    meta = {"children": {}}

    num = elmt.find("num")
    if num is not None:
        meta["num"] = get_text(num)

    title = elmt.find("heading")
    if title is not None:
        # stripped_strings is empty for a heading without text; use a default
        # so the exhausted iterator does not leak StopIteration to the caller.
        first_string = next(title.stripped_strings, None)
        if first_string is not None:
            meta["heading"] = first_string

    return meta
|
||||
|
||||
|
||||
def get_text(elmt):
    """Return the concatenated text of the element's plain/bold/italic children.

    Only direct children whose ``name`` is ``None`` (text nodes), ``"b"`` or
    ``"i"`` are kept. Each child's text is stripped before joining (no
    separator is inserted), and the joined result is stripped once more.
    """
    kept_names = (None, "b", "i")
    pieces = []
    for node in elmt.children:
        if node.name in kept_names:
            pieces.append(node.get_text().strip())
    return "".join(pieces).strip()
|
||||
|
||||
|
||||
def parse_elmt(elmt):
    """Convert a parsed XML node into plain, JSON-serialisable data.

    Args:
        elmt: a node exposing ``name`` and, depending on the branch taken,
            ``get_text()``, ``children``, ``find()`` and ``find_all()``.

    Returns:
        * node without a name (text): its stripped text;
        * ``sup`` / ``sub``: ``{"type": <tag name>, "body": [parsed children]}``;
        * ``blockList``: ``{"type": "enum", "intro": ..., "items": [...]}``
          where ``items`` holds one ``(num, body)`` tuple per direct ``item``
          child. ``intro`` is ``None`` when the list has no
          ``listIntroduction``, and an item's ``num`` is ``None`` when the
          item has no ``num`` element;
        * any other element: its parsed children, skipping ``num`` children,
          passed through ``remove_empty``.
    """
    name = elmt.name

    if name is None:
        return elmt.get_text().strip()

    if name in ("sup", "sub"):
        return {
            "type": name,
            "body": [parse_elmt(child) for child in elmt.children],
        }

    if name == "blockList":
        # find() returns None when the element is absent; guard before
        # recursing so a list without an introduction does not crash.
        intro_elmt = elmt.find("listIntroduction")
        intro = parse_elmt(intro_elmt) if intro_elmt is not None else None

        items = []
        for item in elmt.find_all("item", recursive=False):
            num_elmt = item.find("num")
            num = get_text(num_elmt) if num_elmt is not None else None
            items.append((num, parse_elmt(item)))

        return {
            "type": "enum",
            "intro": intro,
            "items": items,
        }

    # Generic element: the number is reported separately by callers, so it is
    # left out of the body.
    body = [parse_elmt(child) for child in elmt.children if child.name != "num"]
    return remove_empty(body)
|
||||
|
||||
|
||||
def remove_empty(body):
    """Drop blank strings from ``body`` and unwrap a single remaining entry.

    Strings that are empty or whitespace-only are removed; non-string entries
    are always kept. If exactly one entry remains it is returned directly,
    otherwise the filtered list is returned.
    """
    kept = [
        part
        for part in body
        if not (isinstance(part, str) and not part.strip())
    ]
    return kept[0] if len(kept) == 1 else kept
|
||||
|
||||
|
||||
def _parse_paragraph(par):
    """Return ``{"num": ..., "content": ...}`` for one ``paragraph`` element."""
    # Only consider the paragraph's own <num>: a recursive search would pick
    # up the number of a nested list item when the paragraph is unnumbered,
    # instead of falling back to "1".
    num_elmt = par.find("num", recursive=False)
    num = "1" if num_elmt is None else get_text(num_elmt)

    content = par.find("content")
    # Authorial notes are removed from the tree before the content is parsed.
    for note in content.find_all("authorialNote"):
        note.extract()

    body = remove_empty([parse_elmt(child) for child in content.children])
    return {"num": num, "content": body}


def convert(in_path, out_path):
    """Convert the XML document at ``in_path`` into a JSON file at ``out_path``.

    All ``transitional`` and ``proviso`` elements are removed first. Then, for
    every ``article`` element, its paragraphs are parsed, its ancestors are
    registered with ``register_path``, and an entry
    ``{"num", "path", "children"}`` is appended to the list stored under
    ``elmts["children"]["article"][<eid>]``. The resulting tree is written
    with ``json.dump``.

    Args:
        in_path: path of the XML file to read.
        out_path: path of the JSON file to write (overwritten if present).
    """
    # Read raw bytes so the XML parser can honour the encoding declared in the
    # document instead of decoding with the platform's default text encoding.
    with open(in_path, "rb") as f:
        bs = BeautifulSoup(f.read(), "xml")

    elmts = {
        "children": {
            "article": {},
        },
    }

    for tag_name in ("transitional", "proviso"):
        for dropped in bs.find_all(tag_name):
            dropped.extract()

    for article in bs.find_all("article"):
        pars = [_parse_paragraph(par) for par in article.find_all("paragraph")]

        path = register_path(elmts, article)
        eid = get_eid(article)
        articles = elmts["children"]["article"]
        if eid not in articles:
            articles[eid] = []
        articles[eid].append({
            "num": get_text(article.num),
            "path": path,
            "children": pars,
        })

    # json.dump escapes non-ASCII characters by default, so the bytes written
    # do not depend on the encoding; it is pinned only for explicitness.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(elmts, f)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # (input XML, output JSON) pairs processed when run as a script.
    paths = [
        ("./raw/RS-311.0-01072024-FR.xml", "./law/code_penal.json"),
    ]

    for job in paths:
        in_path, out_path = job
        # Announce each conversion before running it.
        print(f"{in_path} -> {out_path}")
        convert(in_path, out_path)
|
||||
Reference in New Issue
Block a user