262 lines
7.5 KiB
Python
Executable file
262 lines
7.5 KiB
Python
Executable file
#!/usr/bin/python3
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import json
|
|
import collections.abc
|
|
|
|
|
|
class OMG:
|
|
"""
|
|
OMG - Obsidian Metadata Generator
|
|
|
|
Generates metadata for markdown files.
|
|
(Like the Obsidian-plugin "Metadata Extractor" does.)
|
|
"""
|
|
|
|
def __init__(self, path: str | os.PathLike, include_hidden: bool=False) -> None:
|
|
"""
|
|
Generates metadata for markdown files located at the given path.
|
|
(Like the Obsidian-plugin "Metadata Extractor" does.)
|
|
|
|
:param PathLike path: The path where the markdown files are located
|
|
:param bool include_hidden: If True, hidden folders and files (dotfiles) will also get searched.
|
|
"""
|
|
|
|
self.path = path
|
|
|
|
if not self.path[-1] == "/":
|
|
self.path += "/"
|
|
|
|
self.include_hidden = include_hidden
|
|
|
|
self.md_files = self._parse_all_files(self.path)
|
|
|
|
def _parse_all_files(self, root: str | os.PathLike, path: str | os.PathLike = None) -> list:
|
|
if path is None:
|
|
path = root
|
|
|
|
md_files = []
|
|
|
|
for filename in os.listdir(path):
|
|
if filename in [".git", ".obsidian"]: # exclude .git and .obsidian
|
|
continue
|
|
|
|
if filename[0] == "." and not self.include_hidden:
|
|
continue
|
|
|
|
filepath = path + filename
|
|
|
|
if os.path.isdir(filepath):
|
|
md_files.extend(self._parse_all_files(root, filepath + "/")) # recurse into subfolders
|
|
continue
|
|
|
|
if not filename.endswith(".md"): # only parse markdown files
|
|
continue
|
|
|
|
file_metadata = {"fileName": filename[:-3], "relativePath": os.path.relpath(filepath, root)}
|
|
file_metadata.update(self._parse_yaml_frontmatter(filepath)) # add yaml frontmatter
|
|
|
|
file_metadata = recursive_update(file_metadata, self._parse_md_contents(filepath))
|
|
|
|
md_files.append(file_metadata)
|
|
|
|
return md_files
|
|
|
|
def _parse_yaml_frontmatter(self, path: str | os.PathLike) -> dict:
|
|
file = open(path)
|
|
|
|
frontmatter_header = file.read(4)
|
|
if not frontmatter_header == "---\n": # file has no frontmatter
|
|
file.close()
|
|
return {}
|
|
|
|
frontmatter = file.read().split("\n---\n")[0]
|
|
|
|
file.close()
|
|
|
|
frontmatter_data = yaml.safe_load(frontmatter)
|
|
|
|
tags_lower = []
|
|
for tag in frontmatter_data["tags"]:
|
|
tags_lower.append(tag.lower())
|
|
|
|
frontmatter_data["tags"] = tags_lower
|
|
|
|
return frontmatter_data
|
|
|
|
def _parse_md_contents(self, path: str | os.PathLike):
|
|
file = open(path)
|
|
content = file.read()
|
|
file.close()
|
|
|
|
file_metadata = {
|
|
"headings": [],
|
|
"tags": [],
|
|
"links": []
|
|
}
|
|
|
|
codeblock = False
|
|
|
|
for line in content.split("\n"):
|
|
if line == "":
|
|
continue
|
|
|
|
if "```" in line: # detect codeblocks and ignore everything inside it
|
|
if line.count("```") % 2: # odd count of "```" in line
|
|
codeblock = not codeblock
|
|
|
|
continue
|
|
|
|
if codeblock:
|
|
continue
|
|
|
|
# ==== headings ====
|
|
if line.startswith("#"): # heading or tag
|
|
tokens = line.split()
|
|
hashtags = tokens[0]
|
|
|
|
# all chars are "#" and there is text after a space (is definitely a heading)
|
|
if len(set(hashtags)) == 1 and len(tokens) > 1:
|
|
heading_text = line[len(hashtags):].strip()
|
|
heading = {
|
|
"heading": heading_text,
|
|
"level": min(len(hashtags), 6)
|
|
}
|
|
|
|
file_metadata["headings"].append(heading)
|
|
|
|
# ==== tags ====
|
|
tags = []
|
|
|
|
if " #" in line:
|
|
tags = line.split(" #")
|
|
tags = tags[1:]
|
|
|
|
if line[0] == "#":
|
|
if not line[1] in ["#", " "]:
|
|
tags.append(line[2:line.find(" #")])
|
|
|
|
for tag in tags:
|
|
if tag[0] in [" ", "#"]:
|
|
continue
|
|
|
|
tag = tag.strip("#")
|
|
|
|
if " " in tag:
|
|
tag = tag.split()[0]
|
|
|
|
if not tag == "":
|
|
file_metadata["tags"].append(tag.lower())
|
|
|
|
# ==== markdown links ====
|
|
if "[" in line and "]" in line and "(" in line and ")" in line:
|
|
for link in line.split("["):
|
|
if not "]" in link:
|
|
continue
|
|
|
|
tokens = link.split("]")
|
|
|
|
display_text = tokens[0]
|
|
url = tokens[1]
|
|
|
|
if url == "" or not url[0] == "(" or not url[-1] == ")": # link has no proper url enclosure
|
|
continue
|
|
|
|
url = url[1:-1]
|
|
url = url.split()[0]
|
|
|
|
link = url.split("/")[-1].strip()
|
|
relpath = os.path.relpath(os.path.join(path, "../" + url.strip()), self.path)
|
|
|
|
if not relpath.endswith(".md"):
|
|
relpath += ".md"
|
|
|
|
link = {"link": link, "relativePath": relpath, "displayText": display_text}
|
|
file_metadata["links"].append(link)
|
|
|
|
# ==== wikilinks ====
|
|
if not "[[" in line or not "]]" in line:
|
|
continue
|
|
|
|
links = line.split("[[")
|
|
|
|
for link in links:
|
|
if not "]]" in link or link.startswith("#"): # link has no end or is leading to a heading
|
|
continue
|
|
|
|
link = link.split("]]")[0]
|
|
|
|
tokens = link.split("|") # ["relPath, "link"]
|
|
|
|
link = tokens[0].split("/")[-1].strip().replace("\\", "")
|
|
relpath = os.path.relpath(
|
|
os.path.join(path, "../" + tokens[0].strip().replace("\\", "")),
|
|
self.path
|
|
)
|
|
|
|
if not relpath.endswith(".md"):
|
|
relpath += ".md"
|
|
|
|
link_data = {"link": link, "relativePath": relpath}
|
|
|
|
if len(tokens) > 1 and not tokens[1] == link:
|
|
link_data["displayText"] = tokens[1]
|
|
|
|
file_metadata["links"].append(link_data)
|
|
|
|
return file_metadata
|
|
|
|
def dump(self, path: str | os.PathLike=None, indent: any=2):
|
|
if path is None:
|
|
path = self.path + "metadata.json"
|
|
|
|
file = open(path, "w")
|
|
|
|
json.dump(self.md_files, file, indent=indent)
|
|
|
|
file.close()
|
|
|
|
|
|
def recursive_update(a: collections.abc.MutableMapping, b: collections.abc.Mapping) -> dict:
    """
    Improvement of the builtin function ``dict.update()`` which also updates sub-dicts and lists recursively.

    Sub-dicts present in both are merged recursively and lists present in both
    are concatenated. In every other case the value of ``b`` replaces the value
    of ``a`` (like ``dict.update()`` does), even if the types differ.

    ``a`` is modified in place. Lists and sub-dicts taken over from ``b`` are
    copied, so later updates of ``a`` never modify ``b``.

    :param dict a: Dict to update
    :param dict b: Dict containing values to add
    :return: The updated dict
    """

    for key, value in b.items():
        current = a.get(key)

        if isinstance(value, collections.abc.Mapping):
            if isinstance(current, collections.abc.MutableMapping):
                a[key] = recursive_update(current, value)

            else:
                # nothing mergeable in a: take over a copy of the sub-dict
                a[key] = recursive_update({}, value)

        elif isinstance(value, list):
            if isinstance(current, list):
                current.extend(value)

            else:
                # copy, so that extending a[key] later does not alter b
                a[key] = list(value)

        else:
            a[key] = value

    return a
|
|
|
|
|
|
# Command line usage: <script> VAULT_PATH [METADATA_PATH]
# Scans the vault and writes its metadata as JSON. Without METADATA_PATH the
# file "metadata.json" inside the vault is written.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # a missing vault path used to be a silent no-op
        sys.exit(f"usage: {sys.argv[0]} VAULT_PATH [METADATA_PATH]")

    vault_path = sys.argv[1]
    metadata_path = sys.argv[2] if len(sys.argv) > 2 else None

    metadata = OMG(vault_path)
    metadata.dump(metadata_path)
|
|
|