OMG/omg.py

262 lines
7.5 KiB
Python
Executable file

#!/usr/bin/python3
import os
import sys
import yaml
import json
import collections.abc
class OMG:
"""
OMG - Obsidian Metadata Generator
Generates metadata for markdown files.
(Like the Obsidian-plugin "Metadata Extractor" does.)
"""
def __init__(self, path: str | os.PathLike, include_hidden: bool=False) -> None:
"""
Generates metadata for markdown files located at the given path.
(Like the Obsidian-plugin "Metadata Extractor" does.)
:param PathLike path: The path where the markdown files are located
:param bool include_hidden: If True, hidden folders and files (dotfiles) will also get searched.
"""
self.path = path
if not self.path[-1] == "/":
self.path += "/"
self.include_hidden = include_hidden
self.md_files = self._parse_all_files(self.path)
def _parse_all_files(self, root: str | os.PathLike, path: str | os.PathLike = None) -> list:
if path is None:
path = root
md_files = []
for filename in os.listdir(path):
if filename in [".git", ".obsidian"]: # exclude .git and .obsidian
continue
if filename[0] == "." and not self.include_hidden:
continue
filepath = path + filename
if os.path.isdir(filepath):
md_files.extend(self._parse_all_files(root, filepath + "/")) # recurse into subfolders
continue
if not filename.endswith(".md"): # only parse markdown files
continue
file_metadata = {"fileName": filename[:-3], "relativePath": os.path.relpath(filepath, root)}
file_metadata.update(self._parse_yaml_frontmatter(filepath)) # add yaml frontmatter
file_metadata = recursive_update(file_metadata, self._parse_md_contents(filepath))
md_files.append(file_metadata)
return md_files
def _parse_yaml_frontmatter(self, path: str | os.PathLike) -> dict:
file = open(path)
frontmatter_header = file.read(4)
if not frontmatter_header == "---\n": # file has no frontmatter
file.close()
return {}
frontmatter = file.read().split("\n---\n")[0]
file.close()
frontmatter_data = yaml.safe_load(frontmatter)
tags_lower = []
for tag in frontmatter_data["tags"]:
tags_lower.append(tag.lower())
frontmatter_data["tags"] = tags_lower
return frontmatter_data
def _parse_md_contents(self, path: str | os.PathLike):
file = open(path)
content = file.read()
file.close()
file_metadata = {
"headings": [],
"tags": [],
"links": []
}
codeblock = False
for line in content.split("\n"):
if line == "":
continue
if "```" in line: # detect codeblocks and ignore everything inside it
if line.count("```") % 2: # odd count of "```" in line
codeblock = not codeblock
continue
if codeblock:
continue
# ==== headings ====
if line.startswith("#"): # heading or tag
tokens = line.split()
hashtags = tokens[0]
# all chars are "#" and there is text after a space (is definitely a heading)
if len(set(hashtags)) == 1 and len(tokens) > 1:
heading_text = line[len(hashtags):].strip()
heading = {
"heading": heading_text,
"level": min(len(hashtags), 6)
}
file_metadata["headings"].append(heading)
# ==== tags ====
tags = []
if " #" in line:
tags = line.split(" #")
tags = tags[1:]
if line[0] == "#":
if not line[1] in ["#", " "]:
tags.append(line[2:line.find(" #")])
for tag in tags:
if tag[0] in [" ", "#"]:
continue
tag = tag.strip("#")
if " " in tag:
tag = tag.split()[0]
if not tag == "":
file_metadata["tags"].append(tag.lower())
# ==== markdown links ====
if "[" in line and "]" in line and "(" in line and ")" in line:
for link in line.split("["):
if not "]" in link:
continue
tokens = link.split("]")
display_text = tokens[0]
url = tokens[1]
if url == "" or not url[0] == "(" or not url[-1] == ")": # link has no proper url enclosure
continue
url = url[1:-1]
url = url.split()[0]
link = url.split("/")[-1].strip()
relpath = os.path.relpath(os.path.join(path, "../" + url.strip()), self.path)
if not relpath.endswith(".md"):
relpath += ".md"
link = {"link": link, "relativePath": relpath, "displayText": display_text}
file_metadata["links"].append(link)
# ==== wikilinks ====
if not "[[" in line or not "]]" in line:
continue
links = line.split("[[")
for link in links:
if not "]]" in link or link.startswith("#"): # link has no end or is leading to a heading
continue
link = link.split("]]")[0]
tokens = link.split("|") # ["relPath, "link"]
link = tokens[0].split("/")[-1].strip().replace("\\", "")
relpath = os.path.relpath(
os.path.join(path, "../" + tokens[0].strip().replace("\\", "")),
self.path
)
if not relpath.endswith(".md"):
relpath += ".md"
link_data = {"link": link, "relativePath": relpath}
if len(tokens) > 1 and not tokens[1] == link:
link_data["displayText"] = tokens[1]
file_metadata["links"].append(link_data)
return file_metadata
def dump(self, path: str | os.PathLike=None, indent: any=2):
if path is None:
path = self.path + "metadata.json"
file = open(path, "w")
json.dump(self.md_files, file, indent=indent)
file.close()
def recursive_update(a: collections.abc.Mapping, b: collections.abc.Mapping) -> dict:
    """
    Improvement of the builtin function ``dict.update()`` which also updates
    sub-dicts recursively and extends lists instead of replacing them.

    :param dict a: Dict to update
    :param dict b: Dict containing values to add
    :return: The updated dict
    """
    for key, incoming in b.items():
        if key not in a:
            # brand-new key: take the incoming value regardless of its type
            a[key] = incoming
        elif isinstance(incoming, collections.abc.Mapping):
            a[key] = recursive_update(a[key], incoming)
        elif isinstance(incoming, list):
            a[key].extend(incoming)
        else:
            a[key] = incoming
    return a
if __name__ == "__main__":
    if len(sys.argv) > 1:
        vault_path = sys.argv[1]
        # optional second argument: explicit output path for the JSON dump
        metadata_path = sys.argv[2] if len(sys.argv) > 2 else None
        metadata = OMG(vault_path)
        metadata.dump(metadata_path)
    else:
        # previously the script exited silently when called without arguments
        print("usage: omg.py VAULT_PATH [METADATA_PATH]", file=sys.stderr)
        sys.exit(1)