OMG/omg.py

262 lines
7.5 KiB
Python
Executable file

#!/usr/bin/python3
import os
import sys
import yaml
import json
import collections.abc
class OMG:
"""
OMG - Obsidian Metadata Generator
Generates metadata for markdown files.
(Like the Obsidian-plugin "Metadata Extractor" does.)
"""
def __init__(self, path: str | os.PathLike, include_hidden: bool=False) -> None:
"""
Generates metadata for markdown files located at the given path.
(Like the Obsidian-plugin "Metadata Extractor" does.)
:param PathLike path: The path where the markdown files are located
:param bool include_hidden: If True, hidden folders and files (dotfiles) will also get searched.
"""
self.path = path
if not self.path[-1] == "/":
self.path += "/"
self.include_hidden = include_hidden
self.md_files = self._parse_all_files(self.path)
def _parse_all_files(self, root: str | os.PathLike, path: str | os.PathLike = None) -> list:
if path is None:
path = root
md_files = []
for filename in os.listdir(path):
if filename in [".git", ".obsidian"]: # exclude .git and .obsidian
continue
if filename[0] == "." and not self.include_hidden:
continue
filepath = path + filename
if os.path.isdir(filepath):
md_files.extend(self._parse_all_files(root, filepath + "/")) # recurse into subfolders
continue
if not filename.endswith(".md"): # only parse markdown files
continue
file_metadata = {"fileName": filename[:-3], "relativePath": os.path.relpath(filepath, root)}
file_metadata.update(self._parse_yaml_frontmatter(filepath)) # add yaml frontmatter
file_metadata = recursive_update(file_metadata, self._parse_md_contents(filepath))
md_files.append(file_metadata)
return md_files
def _parse_yaml_frontmatter(self, path: str | os.PathLike) -> dict:
file = open(path)
frontmatter_header = file.read(4)
if not frontmatter_header == "---\n": # file has no frontmatter
file.close()
return {}
frontmatter = file.read().split("\n---\n")[0]
file.close()
frontmatter_data = yaml.safe_load(frontmatter)
tags_lower = []
for tag in frontmatter_data["tags"]:
tags_lower.append(tag.lower())
frontmatter_data["tags"] = tags_lower
return frontmatter_data
def _parse_md_contents(self, path: str | os.PathLike):
file = open(path)
content = file.read()
file.close()
file_metadata = {
"headings": [],
"tags": [],
"links": []
}
codeblock = False
for line in content.split("\n"):
if line == "":
continue
if "```" in line: # detect codeblocks and ignore everything inside it
if line.count("```") % 2: # odd count of "```" in line
codeblock = not codeblock
continue
if codeblock:
continue
# ==== headings ====
if line.startswith("#"): # heading or tag
tokens = line.split()
hashtags = tokens[0]
# all chars are "#" and there is text after a space (is definitely a heading)
if len(set(hashtags)) == 1 and len(tokens) > 1:
heading_text = line[len(hashtags):].strip()
heading = {
"heading": heading_text,
"level": min(len(hashtags), 6)
}
file_metadata["headings"].append(heading)
# ==== tags ====
tags = []
if " #" in line:
tags = line.split(" #")
tags = tags[1:]
if line[0] == "#":
if not line[1] in ["#", " "]:
tags.append(line[2:line.find(" #")])
for tag in tags:
if tag[0] in [" ", "#"]:
continue
tag = tag.strip("#")
if " " in tag:
tag = tag.split()[0]
if not tag == "":
file_metadata["tags"].append(tag.lower())
# ==== markdown links ====
if "[" in line and "]" in line and "(" in line and ")" in line:
for link in line.split("["):
if not "]" in link:
continue
tokens = link.split("]")
display_text = tokens[0]
url = tokens[1]
if url == "" or not url[0] == "(" or not url[-1] == ")": # link has no proper url enclosure
continue
url = url[1:-1]
url = url.split()[0]
link = url.split("/")[-1].strip()
relpath = os.path.relpath(os.path.join(path, "../" + url.strip()), self.path)
if not relpath.endswith(".md"):
relpath += ".md"
link = {"link": link, "relativePath": relpath, "displayText": display_text}
file_metadata["links"].append(link)
# ==== wikilinks ====
if not "[[" in line or not "]]" in line:
continue
links = line.split("[[")
for link in links:
if not "]]" in link or link.startswith("#"): # link has no end or is leading to a heading
continue
link = link.split("]]")[0]
tokens = link.split("|") # ["relPath, "link"]
link = tokens[0].split("/")[-1].strip().replace("\\", "")
relpath = os.path.relpath(
os.path.join(path, "../" + tokens[0].strip().replace("\\", "")),
self.path
)
if not relpath.endswith(".md"):
relpath += ".md"
link_data = {"link": link, "relativePath": relpath}
if len(tokens) > 1 and not tokens[1] == link:
link_data["displayText"] = tokens[1]
file_metadata["links"].append(link_data)
return file_metadata
def dump(self, path: str | os.PathLike=None, indent: any=2):
if path is None:
path = self.path + "metadata.json"
file = open(path, "w")
json.dump(self.md_files, file, indent=indent)
file.close()
def recursive_update(a: collections.abc.Mapping, b: collections.abc.Mapping) -> dict:
    """
    Improvement of the builtin function ``dict.update()`` which also updates
    sub-dicts recursively and extends lists instead of replacing them.

    :param dict a: Dict to update
    :param dict b: Dict containing values to add
    :return: The updated dict
    """
    for key, incoming in b.items():
        if key not in a:
            # brand-new key: take the incoming value regardless of its type
            a[key] = incoming
        elif isinstance(incoming, collections.abc.Mapping):
            a[key] = recursive_update(a[key], incoming)
        elif isinstance(incoming, list):
            a[key].extend(incoming)
        else:
            a[key] = incoming
    return a
if __name__ == "__main__":
    if len(sys.argv) > 1:
        vault_path = sys.argv[1]
        # optional second argument: explicit output path for the JSON dump
        metadata_path = sys.argv[2] if len(sys.argv) > 2 else None
        metadata = OMG(vault_path)
        metadata.dump(metadata_path)
    else:
        # previously the script exited silently when called without arguments
        print("usage: omg.py VAULT_PATH [METADATA_PATH]", file=sys.stderr)
        sys.exit(1)