You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
568 lines
12 KiB
JavaScript
568 lines
12 KiB
JavaScript
2 years ago
|
/**
|
||
|
* HTML2Markdown - An HTML to Markdown converter.
|
||
|
*
|
||
|
* This implementation uses HTML DOM parsing for conversion. Parsing code was
|
||
|
* abstracted out in a parsing function which should be easy to remove in favor
|
||
|
* of other parsing libraries.
|
||
|
*
|
||
|
* Converted MarkDown was tested with ShowDown library for HTML rendering. And
|
||
|
* it tries to create MarkDown that does not confuse ShowDown when certain
|
||
|
* combination of HTML tags come together.
|
||
|
*
|
||
|
* @author Himanshu Gilani
|
||
|
* @author Kates Gasis (original author)
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
if (typeof require != "undefined") {
|
||
|
var htmlparser = require("./htmldomparser");
|
||
|
var HTMLParser = htmlparser.HTMLParser;
|
||
|
}
|
||
|
|
||
|
/**
 * HTML2Markdown
 *
 * Converts HTML to Markdown. The HTML is loaded into a detached <div> and
 * walked by HTMLParser, which reports start tags, text and end tags; the
 * handlers below append Markdown fragments to an output list that is joined
 * at the end.
 *
 * Relies on names defined outside this function: HTMLParser, document/window,
 * getNormalizedUrl, and String.prototype.trim/startsWith/endsWith.
 *
 * @param html - html string to convert; when falsy, window.document.body is
 *               converted instead
 * @param opts - optional settings object; opts.inlineStyle (default false)
 *               makes links and images use inline "(url)" syntax instead of
 *               numbered reference definitions appended at the end
 * @return converted markdown text (whatever was produced so far if the
 *         conversion throws; the error is logged to the console)
 */
function HTML2Markdown(html, opts) {
  var logging = false;
  // Output fragments in document order. Also used as a stack: an opening
  // marker (e.g. "**", "[", a list bullet) stays here until its end tag
  // slices everything pushed after it back off.
  var nodeList = [];
  // Bullet of every list that is currently open, outermost first.
  var listTagStack = [];
  // Attribute maps of the <a> elements that are currently open.
  var linkAttrStack = [];
  // One "> " per open <blockquote>.
  var blockquoteStack = [];
  // One entry per open <pre>/<code>; non-empty means "inside code".
  var preStack = [];

  // URLs for reference-style links; the array index is the reference id.
  var links = [];

  opts = opts || {};
  var inlineStyle = opts['inlineStyle'] || false;

  // Markdown needs four spaces (or a tab) to open a code block, and the same
  // indent nests a sub-list under both "* " and "1. " parents.
  var CODE_INDENT = "    ";
  var NESTED_LIST_INDENT = "    ";

  var markdownTags = {
    "hr": "- - -\n\n",
    // A hard line break is two (or more) trailing spaces before the newline.
    "br": "  \n",
    "title": "# ",
    "h1": "# ",
    "h2": "## ",
    "h3": "### ",
    "h4": "#### ",
    "h5": "##### ",
    "h6": "###### ",
    "b": "**",
    "strong": "**",
    "i": "_",
    "em": "_",
    "dfn": "_",
    "var": "_",
    "cite": "_",
    "span": " ",
    "ul": "* ",
    "ol": "1. ",
    "dl": "- ",
    "blockquote": "> "
  };

  // Bullet for an item of the innermost open list, indented once per
  // enclosing list. Returns "" when no list is open.
  function getListMarkdownTag() {
    var listItem = "";
    for (var i = 0; i < listTagStack.length - 1; i++) {
      listItem += NESTED_LIST_INDENT;
    }
    listItem += peek(listTagStack);
    return listItem;
  }

  // Re-keys the parser's attribute collection by attribute name.
  function convertAttrs(attrs) {
    var attributes = {};
    for (var k in attrs) {
      var attr = attrs[k];
      attributes[attr.name] = attr;
    }
    return attributes;
  }

  // Last element of list, or "" when the list is empty/missing.
  function peek(list) {
    if (list && list.length > 0) {
      return list[list.length - 1];
    }
    return "";
  }

  // Last element of list that is not "", or "" when there is none.
  function peekTillNotEmpty(list) {
    if (!list) {
      return "";
    }

    for (var i = list.length - 1; i >= 0; i--) {
      if (list[i] != "") {
        return list[i];
      }
    }
    return "";
  }

  // If nothing but empty fragments followed the opening marker `start`,
  // removes the marker (and those empties) and returns true.
  function removeIfEmptyTag(start) {
    var cleaned = false;
    if (start == peekTillNotEmpty(nodeList)) {
      while (peek(nodeList) != start) {
        nodeList.pop();
      }
      nodeList.pop();
      cleaned = true;
    }
    return cleaned;
  }

  // Pops everything pushed after the marker `start` and returns it joined.
  // The marker itself stays on nodeList.
  function sliceText(start) {
    var text = [];
    while (nodeList.length > 0 && peek(nodeList) != start) {
      text.unshift(nodeList.pop());
    }
    return text.join("");
  }

  // Makes sure the output ends with a block separator.
  // isEndBlock falsy: normalise trailing whitespace of the last fragment and
  // top it up to a blank line. isEndBlock true: append a blank line unless
  // the last fragment already ends with a newline.
  // A trailing empty fragment (or an empty output) is consumed and nothing
  // is added.
  function block(isEndBlock) {
    var lastItem = nodeList.pop();
    if (!lastItem) {
      return;
    }

    if (!isEndBlock) {
      var separator;
      if (/\s*\n\n\s*$/.test(lastItem)) {
        lastItem = lastItem.replace(/\s*\n\n\s*$/, "\n\n");
        separator = "";
      } else if (/\s*\n\s*$/.test(lastItem)) {
        lastItem = lastItem.replace(/\s*\n\s*$/, "\n");
        separator = "\n";
      } else {
        separator = "\n\n";
      }

      nodeList.push(lastItem);
      nodeList.push(separator);
    } else {
      nodeList.push(lastItem);
      if (!lastItem.endsWith("\n")) {
        nodeList.push("\n\n");
      }
    }
  }

  // Makes sure the output ends with a single newline (list items are
  // separated by one newline, not a blank line).
  function listBlock() {
    if (nodeList.length > 0) {
      var li = peek(nodeList);

      if (!li.endsWith("\n")) {
        nodeList.push("\n");
      }
    } else {
      nodeList.push("\n");
    }
  }

  try {
    var dom;
    if (html) {
      var e = document.createElement('div');
      e.innerHTML = html;
      dom = e;
    } else {
      dom = window.document.body;
    }

    HTMLParser(dom, {
      start: function(tag, attrs, unary) {
        tag = tag.toLowerCase();
        if (logging) {
          console.log("start: " + tag);
        }

        // Only these self-closing tags produce output.
        if (unary && (tag != "br" && tag != "hr" && tag != "img")) {
          return;
        }

        switch (tag) {
        case "br":
          nodeList.push(markdownTags[tag]);
          break;
        case "hr":
          block();
          nodeList.push(markdownTags[tag]);
          break;
        case "title":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
          block();
          nodeList.push(markdownTags[tag]);
          break;
        case "b":
        case "strong":
        case "i":
        case "em":
        case "dfn":
        case "var":
        case "cite":
          nodeList.push(markdownTags[tag]);
          break;
        case "span":
          if (!/\s+$/.test(peek(nodeList))) {
            nodeList.push(markdownTags[tag]);
          }
          break;
        case "p":
        case "div":
        case "td":
          block();
          break;
        case "ul":
        case "ol":
        case "dl":
          listTagStack.push(markdownTags[tag]);
          // lists are block elements; a nested list only needs a newline
          if (listTagStack.length > 1) {
            listBlock();
          } else {
            block();
          }
          break;
        case "li":
        case "dt":
          nodeList.push(getListMarkdownTag());
          break;
        case "a":
          linkAttrStack.push(convertAttrs(attrs));
          nodeList.push("[");
          break;
        case "img":
          var imgAttrs = convertAttrs(attrs);
          var src = imgAttrs["src"] ? getNormalizedUrl(imgAttrs["src"].value) : "";
          if (!src) {
            break;
          }

          var alt = imgAttrs['alt'] ? imgAttrs['alt'].value.trim() : "";
          var imgTitle = imgAttrs['title'] ? imgAttrs['title'].value.trim() : "";

          // An image directly after "[" sits inside a link; those always use
          // inline style so the enclosing link can wrap them.
          var followsLinkOpen = peekTillNotEmpty(nodeList).startsWith("[");

          if (!inlineStyle && !followsLinkOpen) {
            var imgRef = links.indexOf(src);
            if (imgRef == -1) {
              links.push(src);
              imgRef = links.length - 1;
            }

            block();
            nodeList.push("![");
            // alt text wins; otherwise fall back to the (possibly empty) title
            nodeList.push(alt != "" ? alt : imgTitle);
            nodeList.push("][" + imgRef + "]");
            block();
          } else {
            // if image is not a link image then treat images as block elements
            if (!followsLinkOpen) {
              block();
            }

            nodeList.push("![" + alt + "](" + src + (imgTitle ? " \"" + imgTitle + "\"" : "") + ")");

            // Always terminate the image block. Inside a link this separator
            // is trimmed away again when the anchor text is collected.
            block(true);
          }
          break;
        case "blockquote":
          block();
          blockquoteStack.push(markdownTags[tag]);
          nodeList.push(blockquoteStack.join(""));
          break;
        case "pre":
        case "code":
          block();
          preStack.push(true);
          break;
        }
      },
      chars: function(text) {
        if (preStack.length > 0) {
          // Inside <pre>/<code>: keep the text verbatim, indented as a code block.
          text = CODE_INDENT + text.replace(/\n/g, "\n" + CODE_INDENT);
        } else if (text.trim() != "") {
          text = text.replace(/\s+/g, " ");

          // Avoid doubled whitespace across fragment boundaries.
          var prevText = peekTillNotEmpty(nodeList);
          if (/\s+$/.test(prevText)) {
            text = text.replace(/^\s+/g, "");
          }
        } else {
          // Whitespace-only text is recorded as an empty placeholder.
          nodeList.push("");
          return;
        }

        if (logging) {
          console.log("text: " + text);
        }

        nodeList.push(text);
      },
      end: function(tag) {
        tag = tag.toLowerCase();
        if (logging) {
          console.log("end: " + tag);
        }

        switch (tag) {
        case "title":
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
          if (!removeIfEmptyTag(markdownTags[tag])) {
            block(true);
          }
          break;
        case "p":
        case "div":
        case "td":
          // Drop trailing whitespace-only fragments before closing the block.
          while (nodeList.length > 0 && peek(nodeList).trim() == "") {
            nodeList.pop();
          }
          block(true);
          break;
        case "b":
        case "strong":
        case "i":
        case "em":
        case "dfn":
        case "var":
        case "cite":
          if (!removeIfEmptyTag(markdownTags[tag])) {
            nodeList.push(sliceText(markdownTags[tag]).trim());
            nodeList.push(markdownTags[tag]);
          }
          break;
        case "a":
          var linkText = sliceText("[").replace(/\s+/g, " ").trim();

          // Always consume the attributes pushed by the matching start tag,
          // including on the early exits below, so the stack cannot
          // accumulate stale entries.
          var linkAttrs = linkAttrStack.pop() || {};

          if (linkText == "") {
            nodeList.pop(); // drop the "["
            break;
          }

          var href = linkAttrs["href"] && linkAttrs["href"].value != ""
              ? getNormalizedUrl(linkAttrs["href"].value) : "";

          if (href == "") {
            // Anchor without a target: keep only its text.
            nodeList.pop();
            nodeList.push(linkText);
            break;
          }

          nodeList.push(linkText);

          // Decided once, before anything else is pushed: testing the top of
          // nodeList after "](...)" has been pushed can never see the "!".
          var isImageLink = linkText.startsWith("!");

          if (!inlineStyle && !isImageLink) {
            var linkRef = links.indexOf(href);
            if (linkRef == -1) {
              links.push(href);
              linkRef = links.length - 1;
            }
            nodeList.push("][" + linkRef + "]");
          } else {
            if (isImageLink) {
              // Linked images are block elements: re-emit "[" + image after
              // a block separator.
              var linkedImage = nodeList.pop();
              linkedImage = nodeList.pop() + linkedImage;
              block();
              nodeList.push(linkedImage);
            }

            var linkTitle = linkAttrs["title"];
            nodeList.push("](" + href + (linkTitle ? " \"" + linkTitle.value.trim().replace(/\s+/g, " ") + "\"" : "") + ")");

            if (isImageLink) {
              block(true);
            }
          }
          break;
        case "ul":
        case "ol":
        case "dl":
          listBlock();
          listTagStack.pop();
          break;
        case "li":
        case "dt":
          var li = getListMarkdownTag();
          if (!removeIfEmptyTag(li)) {
            var itemText = sliceText(li).trim();

            if (itemText.startsWith("[![")) {
              // An item that is just a linked image becomes a plain block:
              // drop the bullet.
              nodeList.pop();
              block();
              nodeList.push(itemText);
              block(true);
            } else {
              nodeList.push(itemText);
              listBlock();
            }
          }
          break;
        case "blockquote":
          blockquoteStack.pop();
          break;
        case "pre":
        case "code":
          block(true);
          preStack.pop();
          break;
        case "span":
          if (peek(nodeList).trim() == "") {
            nodeList.pop();
            if (peek(nodeList) == " ") {
              nodeList.pop();
            } else {
              nodeList.push(markdownTags[tag]);
            }
          } else {
            var spanText = nodeList.pop();
            nodeList.push(spanText.trim());
            nodeList.push(markdownTags[tag]);
          }
          break;
        case "br":
        case "hr":
        case "img":
        case "table":
        case "tr":
          // nothing to close
          break;
        }

      }
    }, {"nodesToIgnore": ["script", "noscript", "object", "iframe", "frame", "head", "style", "label"]});

    if (!inlineStyle) {
      // Append the reference definitions, separated from the text by a blank line.
      for (var i = 0; i < links.length; i++) {
        if (i == 0) {
          var lastItem = nodeList.pop();
          nodeList.push(lastItem.replace(/\s+$/g, ""));
          nodeList.push("\n\n[" + i + "]: " + links[i]);
        } else {
          nodeList.push("\n[" + i + "]: " + links[i]);
        }
      }
    }
  } catch (e) {
    // Best effort: report the failure and return what was converted so far.
    console.log(e.stack);
    console.trace();
  }

  return nodeList.join("");

}
|
||
|
|
||
|
/**
 * Resolves a URL taken from an href/src attribute against the current page
 * (the global `location`) and returns it URI-encoded.
 *
 * Handled forms:
 *   scheme:...      already absolute, used as is
 *   //host/path     protocol-relative, gets the page's protocol
 *   /path           relative to the site root
 *   #fragment       relative to the page (query string kept)
 *   ?query          relative to the page (query string replaced)
 *   anything else   relative to the page's directory
 *
 * @param s - url string as written in the document
 * @return absolute, encoded url
 */
function getNormalizedUrl(s) {
  // Page url without fragment, and without fragment + query string.
  var urlPage = location.href.replace(/#.*$/, '');
  var urlPath = urlPage.replace(/\?.*$/, '');
  // Directory of the page; the query string is stripped first so that a "/"
  // inside it is not mistaken for a path separator.
  var urlDir = urlPath.replace(/\/[^\/]*$/, '/');

  var url;
  if (/^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(s)) {
    // already absolute url (scheme = letter followed by letters, digits, "+", ".", "-")
    url = s;
  } else if (/^\/\//.test(s)) {
    // protocol-relative url: only the scheme is missing
    url = location.protocol + s;
  } else if (/^\//.test(s)) {
    // url is relative to site
    url = location.protocol != "" ? location.protocol + "//" : "";
    url += location.hostname;
    // location.port is "" for the scheme's default port; never emit "host:".
    var port = location.port;
    var isDefaultPort = !port ||
        (location.protocol == "http:" && port == "80") ||
        (location.protocol == "https:" && port == "443");
    if (!isDefaultPort) {
      url += ":" + port;
    }
    url += s;
  } else if (/^#/.test(s)) {
    // url is relative to page
    url = urlPage + s;
  } else if (/^\?/.test(s)) {
    // query-only reference replaces the page's query string
    url = urlPath + s;
  } else {
    url = urlDir + s;
  }
  // encodeURI also escapes "%", which would turn an existing "%20" into
  // "%2520"; restore escapes that were already present in the input.
  return encodeURI(url).replace(/%25([0-9a-fA-F]{2})/g, "%$1");
}
|
||
|
|
||
|
// Publish the converter for CommonJS consumers (no-op in a plain browser
// script, where `exports` does not exist).
if (typeof exports != "undefined") {
  exports.HTML2Markdown = HTML2Markdown;
  // Alias with a capital "D". It must point at HTML2Markdown: no function
  // named HTML2MarkDown exists, so referencing that name throws.
  exports.HTML2MarkDown = HTML2Markdown;
}
|
||
|
|
||
|
/* Add helper methods to String.prototype (and Array.prototype) where the
 * runtime does not already provide them. */

// trim(): the string without leading and trailing whitespace.
if (typeof String.prototype.trim != 'function') {
  String.prototype.trim = function() {
    // Must be called on `this`; a bare replace(...) is an undefined global.
    return this.replace(/^\s+|\s+$/g, "");
  };
}
|
||
|
|
||
|
// isNotEmpty(): true when the string holds at least one non-whitespace
// character, false otherwise.
if (typeof String.prototype.isNotEmpty !== 'function') {
  String.prototype.isNotEmpty = function() {
    var hasVisibleChar = /\S/.test(this);
    return hasVisibleChar;
  };
}
|
||
|
|
||
|
// replaceAll(stringToFind, stringToReplace): the string with every
// occurrence of stringToFind replaced by stringToReplace.
if (typeof String.prototype.replaceAll != 'function') {
  String.prototype.replaceAll = function(stringToFind, stringToReplace) {
    // split/join visits each occurrence exactly once. Re-scanning the result
    // after every replacement never terminates when the replacement contains
    // the search string (e.g. "-" -> "--") or the search string is empty.
    return String(this).split(stringToFind).join(stringToReplace);
  };
}
|
||
|
|
||
|
// startsWith(str): true when the first occurrence of str is at position 0.
if (typeof String.prototype.startsWith !== 'function') {
  String.prototype.startsWith = function(str) {
    var firstMatch = this.indexOf(str);
    return firstMatch == 0;
  };
}
|
||
|
|
||
|
// endsWith(suffix): true when the string ends with the literal text suffix.
if (typeof String.prototype.endsWith != 'function') {
  String.prototype.endsWith = function(suffix) {
    // Compare literally. Building a regular expression from the suffix makes
    // characters such as "(", "+" or "." throw or match the wrong text.
    var text = String(this);
    var tail = String(suffix);
    var start = text.length - tail.length;
    return start >= 0 && text.indexOf(tail, start) === start;
  };
}
|
||
|
|
||
|
// indexOf(obj, fromIndex): index of the first element strictly equal (===)
// to obj, searching from fromIndex (default 0; a negative value counts back
// from the end, clamped to 0). Returns -1 when nothing matches.
if (typeof Array.prototype.indexOf !== 'function') {
  Array.prototype.indexOf = function(obj, fromIndex) {
    var total = this.length;
    var begin = 0;
    if (fromIndex != null) {
      begin = fromIndex < 0 ? Math.max(0, total + fromIndex) : fromIndex;
    }
    for (var pos = begin; pos < total; pos++) {
      if (this[pos] === obj) {
        return pos;
      }
    }
    return -1;
  };
}
|