commit a785270fda92ec283fbc80e94ee4ee03638495d6 Author: The Wobbler Date: Wed Apr 9 17:15:25 2025 +0200 Got extracting of YAML-frontmatter, headings, tags and wikilinks working. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d569ea --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +metadata.json +.idea \ No newline at end of file diff --git a/omg.py b/omg.py new file mode 100644 index 0000000..477dda3 --- /dev/null +++ b/omg.py @@ -0,0 +1,205 @@ +#!/usr/bin/python3 + +import os +import yaml +import json +import collections.abc + + +class OMG: + """ + OMG - Obsidian Metadata Generator + + Generates metadata for markdown files. + (Like the Obsidian-plugin "Metadata Extractor" does.) + """ + + def __init__(self, path: str | os.PathLike) -> None: + """ + Generates metadata for markdown files located at the given path. + (Like the Obsidian-plugin "Metadata Extractor" does.) + + :param PathLike path: The path where the markdown files are located + """ + + self.path = path + + if not self.path[-1] == "/": + self.path += "/" + + self.md_files = self._parse_all_files(self.path) + + def _parse_all_files(self, root: str | os.PathLike, path: str | os.PathLike = None) -> list: + if path is None: + path = root + + md_files = [] + + for filename in os.listdir(path): + if filename in [".git", ".obsidian"]: # exclude .git and .obsidian + continue + + filepath = path + filename + + if os.path.isdir(filepath): + md_files.extend(self._parse_all_files(root, filepath + "/")) # recurse into subfolders + continue + + if not filename.endswith(".md"): # only parse markdown files + continue + + file_metadata = {"fileName": filename[:-3], "relativePath": os.path.relpath(filepath, root)} + file_metadata.update(self._parse_yaml_frontmatter(filepath)) # add yaml frontmatter + + file_metadata = recursive_update(file_metadata, self._parse_md_contents(filepath)) + + md_files.append(file_metadata) + + return md_files + + def _parse_yaml_frontmatter(self, path: str | os.PathLike) -> dict: + file = open(path) + + frontmatter_header = file.read(4) + if not frontmatter_header == "---\n": # file has no frontmatter + file.close() + return {} + + frontmatter = file.read().split("\n---\n")[0] + + file.close() + + frontmatter_data = yaml.safe_load(frontmatter) + + tags_lower = [] + for tag in frontmatter_data["tags"]: + tags_lower.append(tag.lower()) + + frontmatter_data["tags"] = tags_lower + + return frontmatter_data + + def _parse_md_contents(self, path: str | os.PathLike): + file = open(path) + content = file.read() + file.close() + + file_metadata = { + "headings": [], + "tags": [], + "links": [] + } + + for line in content.split("\n"): + if line == "": + continue + + # ==== headings ==== + if line.startswith("#"): # heading or tag + tokens = line.split() + hashtags = tokens[0] + + # all chars are "#" and there is text after a space (is definitely a heading) + if len(set(hashtags)) == 1 and len(tokens) > 1: + heading_text = line[len(hashtags):].strip() + heading = { + "heading": heading_text, + "level": min(len(hashtags), 6) + } + + file_metadata["headings"].append(heading) + + # ==== tags ==== + tags = [] + + if " #" in line: + tags = line.split(" #") + tags = tags[1:] + + if line[0] == "#": + if not line[1] in ["#", " "]: + tags.append(line[2:line.find(" #")]) + + for tag in tags: + if tag[0] in [" ", "#"]: + continue + + tag = tag.strip("#") + + if " " in tag: + tag = tag.split()[0] + + if not tag == "": + file_metadata["tags"].append(tag.lower()) + + # ==== wikilinks ==== + if not "[[" in line or not "]]" in line: + continue + + links = line.split("[[") + + for link in links: + if not "]]" in link or link.startswith("#"): # link has no end or is leading to a heading + continue + + link = link.split("]]")[0] + + tokens = link.split("|") # ["relPath, "link"] + + link = tokens[0].split("/")[-1] + relpath = os.path.relpath(os.path.join(path, "../" + tokens[0]), self.path) + ".md" + + link_data = {"link": link, "relativePath": relpath} + + if len(tokens) > 1 and not tokens[1] == link: + link_data["displayText"] = tokens[1] + + file_metadata["links"].append(link_data) + + return file_metadata + + def dump(self, path: str | os.PathLike=None, indent: any=2): + if path is None: + path = self.path + "metadata.json" + + file = open(path, "w") + + json.dump(self.md_files, file, indent=indent) + + file.close() + + +def recursive_update(a: collections.abc.Mapping, b: collections.abc.Mapping) -> dict: + """ + Improvement of the builtin function ``dict.update()`` which also updates sub-dicts and lists recursively. + + :param dict a: Dict to update + :param dict b: Dict containing values to add + :return: The updated dict + """ + + for key, value in b.items(): + if isinstance(value, collections.abc.Mapping): + if key in a: + a[key] = recursive_update(a[key], value) + + else: + a[key] = value + + elif isinstance(value, list): + if key in a: + a[key].extend(value) + + else: + a[key] = value + + else: + a[key] = value + + return a + + +if __name__ == "__main__": + bla = OMG("/home/emil/Dokumente/Obsidian/Gulm") + + print(json.dumps(bla.md_files, indent=2)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4818cc5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +pyyaml \ No newline at end of file