器→工具, 工具软件, 开源项目, 数据, 术→技巧, 研发

网页正文提取工具Readability

钱魏Way · · 0 次浏览

什么是Readability?

如果你在网上看到一篇不错的文章想收藏起来稍后再读,却又不想同时保存那一大堆花花绿绿的广告或无关紧要的一些网页元素,那么你可以试试 Readability 了!

Readability 是一个颇有特色的“稍后阅读”网络收藏夹服务,除了在你看到喜欢的文章时可以收藏下来之外,它最大的特点在于它能自动智能地剔除网页上一些不重要的元素并重新排版,仅为你呈现干净整洁的正文部分,使你的阅读体验更佳!它除了拥有主流浏览器的插件之外,还提供了 iOS/Android/Kindle 等移动版本的应用,可以同步到手机上随时随地高效舒适地阅读……

Readability的实现原理

从网页中提取出主要内容,一直是一个比较有挑战的算法。

Readability之前是开源的,后来不再公开了,这里还有当初的开源版本可以参考:arc90labs-readability – Readability cleans up hard-to-read articles on the Web

Readability通过遍历Dom对象,通过标签和常用文字的加减权,来重新整合出页面的内容。接下来我们就简单看看这个算法是如何实现的。首先,它定义了一系列正则:

regexps: {
        unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
        okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,
        positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
        negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
        extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
        divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
        replaceBrs:            /(<br[^>]*>[ \n\r\t]*){2,}/gi,
        replaceFonts:          /<(\/?)font[^>]*>/gi,
        trim:                  /^\s+|\s+$/g,
        normalize:             /\s{2,}/g,
        killBreaks:            /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
        videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
        skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
        nextLink:              /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
        prevLink:              /(prev|earl|old|new|<|«)/i
},

可以看到,标签和文字都有加权或降权分组。整个内容分析是通过grabArticle函数来实现的。首先开始遍历节点:

for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1)

然后将不像内容的元素去掉:

if (stripUnlikelyCandidates) 
{
    var unlikelyMatchString = node.className + node.id;
    if (
        (
            unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
            unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
            node.tagName !== "BODY"
        )
    )
    {
        dbg("Removing unlikely candidate - " + unlikelyMatchString);
        node.parentNode.removeChild(node);
        nodeIndex-=1;
        continue;
    }               
}

将DIV替换为P标签后,再对目标节点进行遍历,进行计分:

var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {
    var parentNode      = nodesToScore[pt].parentNode;
    var grandParentNode = parentNode ? parentNode.parentNode : null;
    var innerText       = readability.getInnerText(nodesToScore[pt]);

    if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
        continue;
    }

    /* If this paragraph is less than 25 characters, don't even count it. */
    if(innerText.length < 25) {
        continue; }

    /* Initialize readability data for the parent. */
    if(typeof parentNode.readability === 'undefined') {
        readability.initializeNode(parentNode);
        candidates.push(parentNode);
    }

    /* Initialize readability data for the grandparent. */
    if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
        readability.initializeNode(grandParentNode);
        candidates.push(grandParentNode);
    }

    var contentScore = 0;

    /* Add a point for the paragraph itself as a base. */
    contentScore+=1;

    /* Add points for any commas within this paragraph */
    contentScore += innerText.split(',').length;
    
    /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
    contentScore += Math.min(Math.floor(innerText.length / 100), 3);
    
    /* Add the score to the parent. The grandparent gets half. */
    parentNode.readability.contentScore += contentScore;

    if(grandParentNode) {
        grandParentNode.readability.contentScore += contentScore/2;             
    }
}

最后根据分值,重新拼接内容:

var articleContent        = document.createElement("DIV");
if (isPaging) {
    articleContent.id     = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes          = topCandidate.parentNode.childNodes;


for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
    var siblingNode = siblingNodes[s];
    var append      = false;

    /**
     * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
     * Example of error visible here: http://www.esquire.com/features/honesty0707
    **/
    if(!siblingNode) {
        continue;
    }

    dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
    dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

    if(siblingNode === topCandidate)
    {
        append = true;
    }

    var contentBonus = 0;
    /* Give a bonus if sibling nodes and top candidates have the example same classname */
    if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
        contentBonus += topCandidate.readability.contentScore * 0.2;
    }

    if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
    {
        append = true;
    }
    
    if(siblingNode.nodeName === "P") {
        var linkDensity = readability.getLinkDensity(siblingNode);
        var nodeContent = readability.getInnerText(siblingNode);
        var nodeLength  = nodeContent.length;
        
        if(nodeLength > 80 && linkDensity < 0.25)
        {
            append = true;
        }
        else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
        {
            append = true;
        }
    }

    if(append) {
        dbg("Appending node: " + siblingNode);

        var nodeToAppend = null;
        if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
            /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
            
            dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
            nodeToAppend = document.createElement("DIV");
            try {
                nodeToAppend.id = siblingNode.id;
                nodeToAppend.innerHTML = siblingNode.innerHTML;
            }
            catch(er) {
                dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
                nodeToAppend = siblingNode;
                s-=1;
                sl-=1;
            }
        } else {
            nodeToAppend = siblingNode;
            s-=1;
            sl-=1;
        }
        
        /* To ensure a node does not interfere with readability styles, remove its classnames */
        nodeToAppend.className = "";

        /* Append sibling and subtract from our list because it removes the node when you append to another node */
        articleContent.appendChild(nodeToAppend);
    }
}

可以看到,里边用到了很多很trick的技巧,比如25字以下的段落不计分。目前已经有很多其他语言的版本,但是考虑到原始功能是针对的英文页面进行设计的。在移植过程中需要考虑:

支持GBK, GB2312等编码

支持更多的视频网站,比如优酷、土豆等

支持提取文章标题

支持图片相对路径转成绝对路径

如何快速使用Readability?

目前收集到的比较好的移植版本为:

发表评论

您的电子邮箱地址不会被公开。 必填项已用*标注