diff --git a/.changeset/strange-ads-tan.md b/.changeset/strange-ads-tan.md new file mode 100644 index 0000000..acc67f5 --- /dev/null +++ b/.changeset/strange-ads-tan.md @@ -0,0 +1,7 @@ +--- +"mddb": minor +--- + +Add Tags Extraction from Markdown Content. +Resolved issues with link extraction from Markdown documents. +Conducted code refactoring for improved readability and maintainability. \ No newline at end of file diff --git a/__mocks__/content/index.mdx b/__mocks__/content/index.mdx index 46d4d2e..56d9c80 100644 --- a/__mocks__/content/index.mdx +++ b/__mocks__/content/index.mdx @@ -1,5 +1,8 @@ --- title: Homepage +tags: tag1, tag2, tag3 --- -# Welcome \ No newline at end of file +# Welcome + +[link](blog0.mdx) diff --git a/src/lib/indexFolder.ts b/src/lib/indexFolder.ts new file mode 100644 index 0000000..069f53c --- /dev/null +++ b/src/lib/indexFolder.ts @@ -0,0 +1,33 @@ +import { recursiveWalkDir } from "../utils/index.js"; +import { FileInfo, processFile } from "./process.js"; + +export function indexFolder( + folderPath: string, + pathToUrlResolver: (filePath: string) => string, + ignorePatterns?: RegExp[] +) { + const filePathsToIndex = recursiveWalkDir(folderPath); + const filteredFilePathsToIndex = filePathsToIndex.filter((filePath) => + shouldIncludeFile(filePath, ignorePatterns) + ); + const files: FileInfo[] = []; + for (const filePath of filteredFilePathsToIndex) { + const fileObject = processFile( + folderPath, + filePath, + pathToUrlResolver, + filePathsToIndex + ); + files.push(fileObject); + } + return files; +} + +function shouldIncludeFile( + filePath: string, + ignorePatterns?: RegExp[] +): boolean { + return !( + ignorePatterns && ignorePatterns.some((pattern) => pattern.test(filePath)) + ); +} diff --git a/src/lib/markdowndb.spec.ts b/src/lib/markdowndb.spec.ts index 5066512..585ea47 100644 --- a/src/lib/markdowndb.spec.ts +++ b/src/lib/markdowndb.spec.ts @@ -189,6 +189,9 @@ describe("MarkdownDB - default config", () => { test("can get all tags", async () => { const dbTags = await mddb.getTags(); const extectedTags = [ + { name: "tag1" }, + { name: "tag2" }, + { name: "tag3" }, { name: "economy" }, { name: "politics" }, { name: "sports" }, @@ -289,7 +292,7 @@ describe("MarkdownDB - custom config", () => { await mddb.init(); await mddb.indexFolder({ folderPath: pathToContentFixture, - ignorePatterns: [/\/ignore\/.*/], + ignorePatterns: [/[\\/]ignore[\\/].*/], pathToUrlResolver: (path) => path .replace(/\.mdx?$/, "") diff --git a/src/lib/markdowndb.ts b/src/lib/markdowndb.ts index d65a31d..67002d7 100644 --- a/src/lib/markdowndb.ts +++ b/src/lib/markdowndb.ts @@ -1,19 +1,16 @@ -import crypto from "crypto"; -import fs from "fs"; import path from "path"; import knex, { Knex } from "knex"; -import { recursiveWalkDir, parseFile, WikiLink } from "../utils/index.js"; +import { MddbFile, MddbTag, MddbLink, MddbFileTag } from "./schema.js"; +import { indexFolder } from "./indexFolder.js"; import { - File, - MddbFile, - Link, - Tag, - FileTag, - MddbTag, - MddbFileTag, - MddbLink, -} from "./schema.js"; + resetDatabaseTables, + mapFileToInsert, + mapLinksToInsert, + isLinkToDefined, + mapFileTagsToInsert, + getUniqueValues, +} from "../utils/databaseUtils.js"; const defaultFilePathToUrl = (filePath: string) => { let url = filePath @@ -60,164 +57,29 @@ export class MarkdownDB { ignorePatterns?: RegExp[]; pathToUrlResolver?: (filePath: string) => string; }) { - // Temporary, we don't want to handle updates now - // so database is refreshed every time the folder - // is indexed - await MddbFile.deleteTable(this.db); - await MddbTag.deleteTable(this.db); - await MddbFileTag.deleteTable(this.db); - await MddbLink.deleteTable(this.db); - - await MddbFile.createTable(this.db); - await MddbTag.createTable(this.db); - await MddbFileTag.createTable(this.db); - await MddbLink.createTable(this.db); - - const filePathsToIndex = recursiveWalkDir(folderPath); - - const filesToInsert: File[] = []; - const fileTagsToInsert: FileTag[] = []; - // TODO shouldn't available tags be explicitly defined in some config file - // instead of being extracted from all files? I think it's better even from user perspective - // as he can easily manage and see all the tags he is using - // (he can qickly look up tag if he's not sure what term he was using in other files) - // + it's easier to implement - const tagsToInsert: Tag[] = []; - const linksToInsert: Link[] = []; - - // TODO is there a better way to do this? - // Temporary containter for storing links extracted from each file - // as a map of file id -> extracted links. - // This is used after all files have been parsed and added to filesToInsert - // to resolve paths in links to target file ids - const filesLinksMap: { - [fileId: string]: { - url: string; - links: WikiLink[]; - }; - } = {}; - - for (const filePath of filePathsToIndex) { - if (ignorePatterns.some((pattern) => pattern.test(filePath))) { - continue; - } - - // id - // TODO this can be autogenerated by database - const encodedPath = Buffer.from(filePath, "utf-8").toString(); - const id = crypto.createHash("sha1").update(encodedPath).digest("hex"); - - // extension - const [, extension] = filePath.match(/.(\w+)$/) || []; - - if (!MddbFile.supportedExtensions.includes(extension)) { - filesToInsert.push({ - _id: id, - file_path: filePath, - extension, - url_path: null, - filetype: null, - metadata: null, - }); - continue; - } - - // url_path - const pathRelativeToFolder = path.relative(folderPath, filePath); - const urlPath = pathToUrlResolver(pathRelativeToFolder); - - // metadata, tags, links - const source: string = fs.readFileSync(filePath, { - encoding: "utf8", - flag: "r", - }); - - const { metadata, links } = parseFile(source, { - permalinks: filePathsToIndex, - }); - const filetype = metadata?.type || null; - - // TODO is there a better way to do this? - filesLinksMap[id] = { - url: urlPath, - links, - }; - - const tags = metadata?.tags || []; - tags.forEach((tag: string) => { - if (!tagsToInsert.some((t) => t.name === tag)) { - tagsToInsert.push({ name: tag }); - } - fileTagsToInsert.push({ file: id, tag }); - }); - - filesToInsert.push({ - _id: id, - file_path: filePath, - extension, - url_path: urlPath, - filetype, - metadata, - }); - } - - Object.entries(filesLinksMap).forEach(([fileId, { url, links }]) => { - links.forEach(({ linkSrc, linkType }) => { - const destPath = resolveLinkToUrlPath(linkSrc, url); - const destFile = filesToInsert.find( - (file) => file.url_path === destPath - ); - if (!destFile) { - return; - } - const linkToInsert = { - // _id: id, - from: fileId, - to: destFile._id, - link_type: linkType, - }; - linksToInsert.push(linkToInsert); - }); - }); - - if (filesToInsert.length >= 500) { - for (let i = 0; i < filesToInsert.length; i += 500) { - await MddbFile.batchInsert(this.db, filesToInsert.slice(i, i + 500)); - } - } else { - await MddbFile.batchInsert(this.db, filesToInsert); - } - - // TODO what happens if some of the files were not inserted? - // I guess inserting tags or links with such files used as foreign keys will fail too, - // but need to check - - if (tagsToInsert.length >= 500) { - for (let i = 0; i < tagsToInsert.length; i += 500) { - await MddbTag.batchInsert(this.db, tagsToInsert.slice(i, i + 500)); - } - } else { - await MddbTag.batchInsert(this.db, tagsToInsert); - } - - if (fileTagsToInsert.length >= 500) { - for (let i = 0; i < fileTagsToInsert.length; i += 500) { - await MddbFileTag.batchInsert( - this.db, - fileTagsToInsert.slice(i, i + 500) - ); - } - } else { - await MddbFileTag.batchInsert(this.db, fileTagsToInsert); - } + await resetDatabaseTables(this.db); + + const fileObjects = indexFolder( + folderPath, + pathToUrlResolver, + ignorePatterns + ); + const filesToInsert = fileObjects.map(mapFileToInsert); + const uniqueTags = getUniqueValues( + fileObjects.flatMap((file) => file.tags) + ); + const tagsToInsert = uniqueTags.map((tag) => ({ name: tag })); + const linksToInsert = fileObjects + .flatMap((fileObject) => { + return mapLinksToInsert(filesToInsert, fileObject); + }) + .filter(isLinkToDefined); + const fileTagsToInsert = fileObjects.flatMap(mapFileTagsToInsert); - if (linksToInsert.length >= 500) { - for (let i = 0; i < linksToInsert.length; i += 500) { - await MddbLink.batchInsert(this.db, linksToInsert.slice(i, i + 500)); - } - } else { - await MddbLink.batchInsert(this.db, linksToInsert); - } + await MddbFile.batchInsert(this.db, filesToInsert); + await MddbTag.batchInsert(this.db, tagsToInsert); + await MddbFileTag.batchInsert(this.db, fileTagsToInsert); + await MddbLink.batchInsert(this.db, getUniqueValues(linksToInsert)); } async getFileById(id: string): Promise { diff --git a/src/lib/process.spec.ts b/src/lib/process.spec.ts new file mode 100644 index 0000000..c59d8d6 --- /dev/null +++ b/src/lib/process.spec.ts @@ -0,0 +1,36 @@ +import { processFile } from "./process"; +import Path from "path"; + +describe("Can parse a file and get file info", () => { + const pathToContentFixture = "__mocks__/content"; + + test("can parse a file", async () => { + const filePath = "index.mdx"; + const fullPath = Path.join(pathToContentFixture, filePath); + const fileInfo = processFile( + pathToContentFixture, + fullPath, + (filePath) => filePath, + [] + ); + + expect(fileInfo.file_path).toBe(fullPath); + expect(fileInfo.url_path).toBe("index.mdx"); + expect(fileInfo.extension).toBe("mdx"); + expect(fileInfo.tags).toEqual(["tag1", "tag2", "tag3"]); + expect(fileInfo.metadata).toEqual({ + title: "Homepage", + tags: ["tag1", "tag2", "tag3"], + }); + expect(fileInfo.links).toEqual([ + { + embed: false, + from: "index.mdx", + internal: true, + text: "link", + to: "blog0.mdx", + toRaw: "blog0.mdx", + }, + ]); + }); +}); diff --git a/src/lib/process.ts b/src/lib/process.ts new file mode 100644 index 0000000..afa5798 --- /dev/null +++ b/src/lib/process.ts @@ -0,0 +1,71 @@ +import crypto from "crypto"; +import fs from "fs"; +import path from "path"; + +import { parseFile, WikiLink } from "../utils/index.js"; +import { File } from "./schema.js"; + +export interface FileInfo extends File { + tags: string[]; + links: WikiLink[]; +} + +// this file is an extraction of the file info parsing from markdowndb.ts without any sql stuff +// TODO: add back (as an option) - providing a "root folder" path for resolve +export function processFile( + rootFolder: string, + filePath: string, + pathToUrlResolver: (filePath: string) => string, + filePathsToIndex: string[] +) { + // Remove rootFolder from filePath + const relativePath = path.relative(rootFolder, filePath); + + // gets key file info if any e.g. extension (file size??) + const encodedPath = Buffer.from(relativePath, "utf-8").toString(); + const id = crypto.createHash("sha1").update(encodedPath).digest("hex"); + + // extension + const extension = path.extname(relativePath).slice(1); + + const fileInfo: FileInfo = { + _id: id, + file_path: filePath, + extension, + url_path: null, + filetype: null, + metadata: {}, + tags: [], + links: [], + }; + + // if not a file type we can parse exit here ... + // if (extension ! in list of supported extensions exit now ...) + const isExtensionSupported = extension === "md" || extension === "mdx"; + if (!isExtensionSupported) { + return fileInfo; + } + + // metadata, tags, links + const source: string = fs.readFileSync(filePath, { + encoding: "utf8", + flag: "r", + }); + + const { metadata, links } = parseFile(source, { + from: relativePath, + permalinks: filePathsToIndex, + }); + + fileInfo.url_path = pathToUrlResolver(relativePath); + fileInfo.metadata = metadata; + fileInfo.links = links; + + const filetype = metadata?.type || null; + fileInfo.filetype = filetype; + + const tags = metadata?.tags || []; + fileInfo.tags = tags; + + return fileInfo; +} diff --git a/src/utils/databaseUtils.ts b/src/utils/databaseUtils.ts new file mode 100644 index 0000000..729308d --- /dev/null +++ b/src/utils/databaseUtils.ts @@ -0,0 +1,75 @@ +import { Knex } from "knex"; +import { + MddbFile, + MddbTag, + MddbLink, + MddbFileTag, + File, +} from "../lib/schema.js"; +import path from "path"; +import { WikiLink } from "./extractWikiLinks.js"; + +export async function resetDatabaseTables(db: Knex) { + const tableNames = [MddbFile, MddbTag, MddbFileTag, MddbLink]; + // Drop and Create tables + for (const table of tableNames) { + await table.deleteTable(db); + await table.createTable(db); + } +} + +export function mapFileToInsert(file: any) { + const { _id, file_path, extension, url_path, filetype, metadata } = file; + return { _id, file_path, extension, url_path, filetype, metadata }; +} + +export function mapLinksToInsert(filesToInsert: File[], file: any) { + return file.links.map((link: WikiLink) => { + let to: string | undefined; + if (!link.internal) { + to = link.toRaw; + } else { + to = findFileToInsert(filesToInsert, link.to)?._id; + } + return { + from: file._id, + to: to, + link_type: link.embed ? "embed" : "normal", + }; + }); +} + +function findFileToInsert(filesToInsert: File[], filePath: string) { + const filePathWithoutExt = path.join( + path.dirname(filePath), + path.basename(filePath, path.extname(filePath)) + ); + + return filesToInsert.find(({ url_path }) => { + const normalizedFile = path.normalize(url_path || ""); + return normalizedFile === filePathWithoutExt; + }); +} + +export function isLinkToDefined(link: any) { + return link.to !== undefined; +} + +export function mapFileTagsToInsert(file: any) { + return file.tags.map((tag: any) => ({ + file: file._id, + tag: tag as unknown as string, + })); +} + +export function getUniqueValues(inputArray: T[]): T[] { + const uniqueArray: T[] = []; + + for (const item of inputArray) { + if (!uniqueArray.includes(item)) { + uniqueArray.push(item); + } + } + + return uniqueArray; +} diff --git a/src/utils/extractTagsFromBody.spec.ts b/src/utils/extractTagsFromBody.spec.ts new file mode 100644 index 0000000..0324b49 --- /dev/null +++ b/src/utils/extractTagsFromBody.spec.ts @@ -0,0 +1,149 @@ +import { extractTagsFromBody } from "./extractTagsFromBody"; + +describe("extractTagsFromBody", () => { + test("should extract tags from body", () => { + const source = "#tag"; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract tags from heading", () => { + const source = "# heading #tag"; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract 2 tags from heading", () => { + const source = "# heading #tag #tag2"; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag", "tag2"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract tags from body text", () => { + const source = "This is a #tag in the body text."; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract 2 tags from body text", () => { + const source = "This is #tag1 and #tag2 in the body text."; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag1", "tag2"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract tags from both heading and body text", () => { + const source = `# head #tag + in heading and also in the #tag-body body text.`; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag", "tag-body"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract tags with numbers", () => { + const source = "This is #tag123 with numbers."; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag123"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract tags with special characters", () => { + const source = + "This is #special-tag #special_tag2 with special characters."; + const tags = extractTagsFromBody(source); + const expectedTags = ["special-tag", "special_tag2"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract tags with slash", () => { + const source = "This is #tag/with/slash."; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag/with/slash"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract tags with multiple tags in a line", () => { + const source = "#tag1 #tag2 #tag3"; + const tags = extractTagsFromBody(source); + const expectedTags = ["tag1", "tag2", "tag3"]; + expect(tags).toEqual(expectedTags); + }); + + // for now we will pass the body content only not the whole source + test("shouldn't extract frontmatter tags", () => { + const content = ` + No tags in this content. + #gr3 + `; + const tags = extractTagsFromBody(content); + const expectedTags: string[] = ["gr3"]; + expect(tags).toEqual(expectedTags); + }); + + test("should extract tags from multiline text", () => { + const source = `This is a multiline text with #tag1 and #tag2. + Multiple tags on different lines: + #tag3 + #tag4 + And another tag: #tag5. + `; + const tags = extractTagsFromBody(source); + const expectedTags: string[] = ["tag1", "tag2", "tag3", "tag4", "tag5"]; + expect(tags).toEqual(expectedTags); + }); + + test("should handle multiple tags in the same line", () => { + const source = `#tag1 #tag2 #tag3 + #tag4 #tag5`; + const tags = extractTagsFromBody(source); + const expectedTags: string[] = ["tag1", "tag2", "tag3", "tag4", "tag5"]; + expect(tags).toEqual(expectedTags); + }); + + test("should handle tags with numbers and slashes in multiline text", () => { + const source = `Tags with numbers: #tag123 and #tag456. + Tags with slashes: #tag/one and #tag/two/three. + `; + const tags = extractTagsFromBody(source); + const expectedTags: string[] = [ + "tag123", + "tag456", + "tag/one", + "tag/two/three", + ]; + expect(tags).toEqual(expectedTags); + }); + + test("should handle tags with special characters in multiline text", () => { + const source = `Tags with special characters: #special-tag and #tag$percent. + Another tag: #tag_with_underscore. + `; + const tags = extractTagsFromBody(source); + const expectedTags: string[] = [ + "special-tag", + "tag", + "tag_with_underscore", + ]; + expect(tags).toEqual(expectedTags); + }); + + test("should handle edge case with no tags in multiline text", () => { + const source = `No tags in this multiline content. + Another line without tags. + `; + const tags = extractTagsFromBody(source); + const expectedTags: string[] = []; + expect(tags).toEqual(expectedTags); + }); + + test("should handle edge case with no tags", () => { + const source = "No tags in this content."; + const tags = extractTagsFromBody(source); + const expectedTags: string[] = []; + expect(tags).toEqual(expectedTags); + }); +}); diff --git a/src/utils/extractTagsFromBody.ts b/src/utils/extractTagsFromBody.ts new file mode 100644 index 0000000..d1f5152 --- /dev/null +++ b/src/utils/extractTagsFromBody.ts @@ -0,0 +1,29 @@ +import markdown from "remark-parse"; +import { unified } from "unified"; +import { selectAll, Node } from "unist-util-select"; + +export interface TagExtractors { + [test: string]: (node: Node) => string[]; // Updated interface for tag extractors +} + +const extractTagsFromBody = (source: string) => { + let tags: string[] = []; + + const processor = unified().use(markdown); + + const ast = processor.parse(source); + const nodes = selectAll("*", ast); + for (let index = 0; index < nodes.length; index++) { + const node: any = nodes[index]; + if (node.value) { + const textTags = node.value.match(/(?:^|\s)(#(\w+|\/|-|_)+)/g); + if (textTags) { + tags = tags.concat(textTags.map((tag: string) => tag.trim().slice(1))); // Extract tags and remove the '#' + } + } + } + + return tags; +}; + +export { extractTagsFromBody }; diff --git a/src/utils/extractWikiLinks.spec.ts b/src/utils/extractWikiLinks.spec.ts index 332e31c..671eade 100644 --- a/src/utils/extractWikiLinks.spec.ts +++ b/src/utils/extractWikiLinks.spec.ts @@ -8,38 +8,137 @@ import { extractWikiLinks } from "./extractWikiLinks"; // TODO test with other remark plugins e.g. original wiki links describe("extractWikiLinks", () => { + const from = "abc/foobar.md"; + describe("Common Mark links", () => { test("should extract CommonMark links", () => { - const source = "[Page 1](page-1) [Page 2](page-2) [Page 3](page-3)"; + const source = "[Page 1](page-1)"; + const links = extractWikiLinks(from, source); const expectedLinks = [ - { linkType: "normal", linkSrc: "page-1" }, - { linkType: "normal", linkSrc: "page-2" }, - { linkType: "normal", linkSrc: "page-3" }, + { + from: "abc/foobar.md", + to: "abc/page-1", + toRaw: "page-1", + text: "Page 1", + embed: false, + internal: true, + }, ]; - const links = extractWikiLinks(source); - expect(links).toHaveLength(expectedLinks.length); - links.forEach((link) => { - expect(expectedLinks).toContainEqual(link); - }); + expect(links).toEqual(expectedLinks); + }); + + test("should extract CommonMark links with image extension", () => { + const source = "[hello](world.png)"; + const links = extractWikiLinks(from, source); + const expectedLinks = [ + { + from: "abc/foobar.md", + to: "abc/world.png", + toRaw: "world.png", + text: "hello", + embed: false, + internal: true, + }, + ]; + expect(links).toEqual(expectedLinks); + }); + + test("should extract CommonMark links with non-image extension", () => { + const source = "[hello](world.mdx)"; + const links = extractWikiLinks(from, source); + const expectedLinks = [ + { + from: "abc/foobar.md", + to: "abc/world.mdx", + toRaw: "world.mdx", + text: "hello", + embed: false, + internal: true, + }, + ]; + expect(links).toEqual(expectedLinks); }); - test("should extract embed type CommonMark links", () => { - const source = "![abc](My_File.png)"; - const expectedLinks = [{ linkType: "embed", linkSrc: "My_File.png" }]; - const links = extractWikiLinks(source); - expect(links[0]).toEqual(expectedLinks[0]); + test("should extract CommonMark links with absolute path", () => { + const source = "[hello](/world)"; + const links = extractWikiLinks(from, source); + const expectedLinks = [ + { + from: "abc/foobar.md", + to: "abc/world", + toRaw: "/world", + text: "hello", + embed: false, + internal: true, + }, + ]; + expect(links).toEqual(expectedLinks); + }); + + test("should extract CommonMark image links", () => { + const source = "![hello](world.png)"; + const links = extractWikiLinks(from, source); + const expectedLinks = [ + { + from: "abc/foobar.md", + to: "abc/world.png", + toRaw: "world.png", + text: "hello", + embed: true, + internal: true, + }, + ]; + expect(links).toEqual(expectedLinks); + }); + + test("should extract CommonMark image links without alt text", () => { + const source = "![](world.png)"; + const links = extractWikiLinks(from, source); + const expectedLinks = [ + { + from: "abc/foobar.md", + to: "abc/world.png", + toRaw: "world.png", + text: "", + embed: true, + internal: true, + }, + ]; + expect(links).toEqual(expectedLinks); }); }); + // TODO Obsidian wiki links describe("Obsidian wiki links", () => { test("should extract wiki links", () => { const source = "[[Page 1]] [[Page 2]] [[Page 3]]"; const expectedLinks = [ - { linkType: "normal", linkSrc: "Page 1" }, - { linkType: "normal", linkSrc: "Page 2" }, - { linkType: "normal", linkSrc: "Page 3" }, + { + embed: false, + from: "abc/foobar.md", + internal: true, + text: "", + to: "abc/Page 1", + toRaw: "Page 1", + }, + { + embed: false, + from: "abc/foobar.md", + internal: true, + text: "", + to: "abc/Page 2", + toRaw: "Page 2", + }, + { + embed: false, + from: "abc/foobar.md", + internal: true, + text: "", + to: "abc/Page 3", + toRaw: "Page 3", + }, ]; - const links = extractWikiLinks(source); + const links = extractWikiLinks("abc/foobar.md", source); expect(links).toHaveLength(expectedLinks.length); links.forEach((link) => { expect(expectedLinks).toContainEqual(link); @@ -49,16 +148,37 @@ describe("extractWikiLinks", () => { test("should extract wiki links with Obsidian-style shortest path", () => { const source = "[[Page 1]] [[Page 2]] [[Page 3]]"; const expectedLinks = [ - { linkType: "normal", linkSrc: "/some/folder/Page 1" }, - { linkType: "normal", linkSrc: "/some/folder/Page 2" }, - { linkType: "normal", linkSrc: "/some/folder/Page 3" }, + { + embed: false, + from: "abc/foobar.md", + internal: true, + text: "", + to: "abc/some/folder/Page 1", + toRaw: "/some/folder/Page 1", + }, + { + embed: false, + from: "abc/foobar.md", + internal: true, + text: "", + to: "abc/some/folder/Page 2", + toRaw: "/some/folder/Page 2", + }, + { + embed: false, + from: "abc/foobar.md", + internal: true, + text: "", + to: "abc/some/folder/Page 3", + toRaw: "/some/folder/Page 3", + }, ]; const permalinks = [ "/some/folder/Page 1", "/some/folder/Page 2", "/some/folder/Page 3", ]; - const links = extractWikiLinks(source, { permalinks }); + const links = extractWikiLinks("abc/foobar.md", source, { permalinks }); expect(links).toHaveLength(expectedLinks.length); links.forEach((link) => { expect(expectedLinks).toContainEqual(link); @@ -67,9 +187,18 @@ describe("extractWikiLinks", () => { test("should extract embedded wiki links", () => { const source = "![[My File.png]]]]"; - const expectedLinks = [{ linkType: "embed", linkSrc: "My File.png" }]; - const links = extractWikiLinks(source); - expect(links[0]).toEqual(expectedLinks[0]); + const expectedLinks = [ + { + from: "abc/foobar.md", + to: "abc/My File.png", + toRaw: "My File.png", + text: "", + embed: true, + internal: true, + }, + ]; + const links = extractWikiLinks("abc/foobar.md", source); + expect(links).toEqual(expectedLinks); }); }); @@ -88,21 +217,31 @@ describe("extractWikiLinks", () => { // }); // }); - test("shouldn't extract external links", () => { + test("should extract external links", () => { const source = "[External Link](https://example.com)"; - const links = extractWikiLinks(source); - expect(links).toHaveLength(0); + const links = extractWikiLinks("abc/foobar.md", source); + const expectedLinks = [ + { + from: "abc/foobar.md", + to: "https://example.com", + toRaw: "https://example.com", + text: "External Link", + embed: false, + internal: false, + }, + ]; + expect(links).toEqual(expectedLinks); }); test("should return empty array if no links are found", () => { const source = "No links here"; - const links = extractWikiLinks(source); + const links = extractWikiLinks(from, source); expect(links).toHaveLength(0); }); test("should return empty array if page is empty", () => { const source = ""; - const links = extractWikiLinks(source); + const links = extractWikiLinks(from, source); expect(links).toHaveLength(0); }); }); diff --git a/src/utils/extractWikiLinks.ts b/src/utils/extractWikiLinks.ts index abdd0c3..951d89d 100644 --- a/src/utils/extractWikiLinks.ts +++ b/src/utils/extractWikiLinks.ts @@ -3,6 +3,7 @@ import { unified, Plugin } from "unified"; import { selectAll } from "unist-util-select"; import remarkWikiLink from "@portaljs/remark-wiki-link"; import gfm from "remark-gfm"; +import * as path from "path"; export interface ExtractWikiLinksOptions { remarkPlugins?: Array; // remark plugins that add custom nodes to the AST @@ -15,40 +16,72 @@ export interface LinkExtractors { } export interface WikiLink { - linkSrc: string; - linkType: "normal" | "embed"; + from: string; + to: string; + toRaw: string; // raw link to + text: string; + embed: boolean; // is it an embed link (default: false) + internal: boolean; // default true (external means http etc - not inside the contentbase) } const extractWikiLinks = ( + from: string, source: string, options?: ExtractWikiLinksOptions ) => { let wikiLinks: WikiLink[] = []; const userExtractors: LinkExtractors = options?.extractors || {}; const userRemarkPlugins: Array = options?.remarkPlugins || []; + const directory = path.dirname(from); const extractors: LinkExtractors = { - link: (node: any) => ({ - linkSrc: node.url, - linkType: "normal", - }), + link: (node: any) => { + const to = !node.url.startsWith("http") + ? path.posix.join(directory, node.url) + : node.url; + return { + from: from, + to: to, + toRaw: node.url, + text: node.children?.[0]?.value || "", + embed: false, + internal: !node.url.startsWith("http"), + }; + }, image: (node: any) => ({ - linkSrc: node.url, - linkType: "embed", + from: from, + to: path.posix.join(directory, node.url), + toRaw: node.url, + text: node.alt || "", + embed: true, + internal: !node.url.startsWith("http"), }), - wikiLink: (node: any) => { + wikiLink: (node) => { const linkType = node.data.isEmbed ? "embed" : "normal"; let linkSrc = ""; + let text = ""; + if (node.data.hName === "img" || node.data.hName === "iframe") { linkSrc = node.data.hProperties.src; + text = node.children?.[0]?.value || ""; } else if (node.data.hName === "a") { linkSrc = node.data.hProperties.href; + text = node.children?.[0]?.value || ""; } else { linkSrc = node.data.permalink; + text = node.children?.[0]?.value || ""; } + const to = !linkSrc.startsWith("http") + ? path.posix.join(directory, linkSrc) + : linkSrc; + return { - linkSrc, - linkType, + from: from, + to: to, + toRaw: linkSrc, + text, + embed: linkType === "embed", + internal: !linkSrc.startsWith("http"), }; }, ...userExtractors, @@ -69,9 +102,7 @@ const extractWikiLinks = ( Object.entries(extractors).forEach(([test, extractor]) => { const nodes = selectAll(test, ast); - const extractedWikiLinks: WikiLink[] = nodes - .map((node: any) => extractor(node)) - .filter((link: WikiLink) => !link.linkSrc.startsWith("http")); + const extractedWikiLinks: WikiLink[] = nodes.map((node) => extractor(node)); wikiLinks = wikiLinks.concat(extractedWikiLinks); }); diff --git a/src/utils/parseFile.spec.ts b/src/utils/parseFile.spec.ts index 28de0e3..041900e 100644 --- a/src/utils/parseFile.spec.ts +++ b/src/utils/parseFile.spec.ts @@ -20,10 +20,38 @@ describe("parseFile", () => { tags: ["a", "b", "c"], }; const expectedLinks = [ - { linkType: "normal", linkSrc: "Some Link" }, - { linkType: "normal", linkSrc: "blog/Some Other Link" }, - { linkType: "normal", linkSrc: "blog/Some Other Link" }, - { linkType: "embed", linkSrc: "Some Image.png" }, + { + embed: false, + from: "", + internal: true, + text: "", + to: "Some Link", + toRaw: "Some Link", + }, + { + embed: false, + from: "", + internal: true, + text: "", + to: "blog/Some Other Link", + toRaw: "blog/Some Other Link", + }, + { + embed: false, + from: "", + internal: true, + text: "", + to: "blog/Some Other Link", + toRaw: "blog/Some Other Link", + }, + { + embed: true, + from: "", + internal: true, + text: "", + to: "Some Image.png", + toRaw: "Some Image.png", + }, ]; const { metadata, links } = parseFile(source); expect(metadata).toEqual(expectedMetadata); @@ -37,10 +65,38 @@ describe("parseFile", () => { tags: ["a", "b", "c"], }; const expectedLinks = [ - { linkType: "normal", linkSrc: "/some/folder/Some Link" }, - { linkType: "normal", linkSrc: "/some/folder/blog/Some Other Link" }, - { linkType: "normal", linkSrc: "/some/folder/blog/Some Other Link" }, - { linkType: "embed", linkSrc: "/some/folder/Some Image.png" }, + { + embed: false, + from: "", + internal: true, + text: "", + to: "some/folder/Some Link", + toRaw: "/some/folder/Some Link", + }, + { + embed: false, + from: "", + internal: true, + text: "", + to: "some/folder/blog/Some Other Link", + toRaw: "/some/folder/blog/Some Other Link", + }, + { + embed: false, + from: "", + internal: true, + text: "", + to: "some/folder/blog/Some Other Link", + toRaw: "/some/folder/blog/Some Other Link", + }, + { + embed: true, + from: "", + internal: true, + text: "", + to: "some/folder/Some Image.png", + toRaw: "/some/folder/Some Image.png", + }, ]; const permalinks = [ "/some/folder/Some Link", diff --git a/src/utils/parseFile.ts b/src/utils/parseFile.ts index 6a2d21a..7060f8a 100644 --- a/src/utils/parseFile.ts +++ b/src/utils/parseFile.ts @@ -1,7 +1,11 @@ import matter from "gray-matter"; import { extractWikiLinks } from "./extractWikiLinks.js"; +import { extractTagsFromBody } from "./extractTagsFromBody.js"; -export function parseFile(source: string, options?: { permalinks?: string[] }) { +export function parseFile( + source: string, + options?: { from?: string; permalinks?: string[] } +) { // Metadata const { data: metadata } = matter(source); @@ -10,8 +14,13 @@ export function parseFile(source: string, options?: { permalinks?: string[] }) { metadata.tags = metadata.tags.split(",").map((tag: string) => tag.trim()); } + const bodyTags = extractTagsFromBody(source); + metadata.tags = metadata.tags ? [...metadata.tags, ...bodyTags] : bodyTags; + // Links - const links = extractWikiLinks(source, { permalinks: options?.permalinks }); + const links = extractWikiLinks(options?.from || "", source, { + permalinks: options?.permalinks, + }); return { metadata, diff --git a/tsconfig.spec.json b/tsconfig.spec.json index 7453ec9..dffd0b0 100644 --- a/tsconfig.spec.json +++ b/tsconfig.spec.json @@ -15,6 +15,7 @@ "src/**/*.spec.js", "src/**/*.test.jsx", "src/**/*.spec.jsx", - "src/**/*.d.ts" + "src/**/*.d.ts", + "src/**/*.ts" ] }