如果你在网上看到一篇不错的文章,想收藏起来稍后再读,却又不想同时保存那一大堆花花绿绿的广告或其他无关紧要的网页元素,那么不妨试试 Readability!
Readability 是一个颇有特色的“稍后阅读”网络收藏夹服务,除了在你看到喜欢的文章时可以收藏下来之外,它最大的特点在于它能自动智能地剔除网页上一些不重要的元素并重新排版,仅为你呈现干净整洁的正文部分,使你的阅读体验更佳!它除了拥有主流浏览器的插件之外,还提供了 iOS/Android/Kindle 等移动版本的应用,可以同步到手机上随时随地高效舒适地阅读……
Readability 之前是开源的,后来不再公开源码;当初的开源版本仍可参考:arc90labs-readability – Readability cleans up hard-to-read articles on the Web。下面摘录其中几段关键代码:
regexps: { unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, okMaybeItsACandidate: /and|article|body|column|main|shadow/i, positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i, divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, replaceFonts: /<(\/?)font[^>]*>/gi, trim: /^\s+|\s+$/g, normalize: /\s{2,}/g, killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. prevLink: /(prev|earl|old|new|<|«)/i },
/*
 * Walk the element list and, when stripping is enabled, detach every node whose
 * combined class name and id matches the "unlikely" pattern without also
 * matching the "maybe a candidate" pattern. <body> is never removed.
 */
for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
    if (stripUnlikelyCandidates) {
        var unlikelyMatchString = node.className + node.id;

        if (
            node.tagName !== "BODY" &&
            readability.regexps.unlikelyCandidates.test(unlikelyMatchString) &&
            !readability.regexps.okMaybeItsACandidate.test(unlikelyMatchString)
        ) {
            dbg("Removing unlikely candidate - " + unlikelyMatchString);
            node.parentNode.removeChild(node);
            /* Step the index back after the removal so the element that follows is not skipped. */
            nodeIndex -= 1;
            continue;
        }
    }
}
/*
 * Score every node in nodesToScore and credit that score to its parent (in
 * full) and its grandparent (half). Each ancestor is pushed onto `candidates`
 * the first time it is initialised via readability.initializeNode (presumably
 * the call that creates the `.readability` data read below - confirm against
 * its definition).
 */
var candidates = [];
for (var pt = 0; pt < nodesToScore.length; pt += 1) {
    var parentNode = nodesToScore[pt].parentNode;
    var grandParentNode = parentNode ? parentNode.parentNode : null;
    var innerText = readability.getInnerText(nodesToScore[pt]);

    /* Skip nodes that are detached or whose parent is not an element. */
    if (!parentNode || typeof(parentNode.tagName) === 'undefined') {
        continue;
    }

    /* If this paragraph is less than 25 characters, don't even count it. */
    if (innerText.length < 25) {
        continue;
    }

    /* Initialize readability data for the parent. */
    if (typeof parentNode.readability === 'undefined') {
        readability.initializeNode(parentNode);
        candidates.push(parentNode);
    }

    /* Initialize readability data for the grandparent (element nodes only). */
    if (grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
        readability.initializeNode(grandParentNode);
        candidates.push(grandParentNode);
    }

    var contentScore = 0;

    /* Add a point for the paragraph itself as a base. */
    contentScore += 1;

    /* Add one point per comma-separated segment (number of commas plus one). */
    contentScore += innerText.split(',').length;

    /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
    contentScore += Math.min(Math.floor(innerText.length / 100), 3);

    /* Add the score to the parent. The grandparent gets half. */
    parentNode.readability.contentScore += contentScore;

    /*
     * Only credit a grandparent that actually carries readability data. A
     * non-element grandparent (e.g. the Document above <html>) is never
     * initialised above, so dereferencing it unguarded would throw.
     */
    if (grandParentNode && typeof(grandParentNode.readability) !== 'undefined') {
        grandParentNode.readability.contentScore += contentScore / 2;
    }
}
/*
 * Build the article container: look through the top candidate's siblings and
 * move over the top candidate itself plus any sibling that scores well enough
 * or is a paragraph that looks like real prose.
 */
var articleContent = document.createElement("DIV");
if (isPaging) {
    articleContent.id = "readability-content";
}

var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;

for (var s = 0, sl = siblingNodes.length; s < sl; s += 1) {
    var siblingNode = siblingNodes[s];
    var append = false;

    /**
     * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
     * Example of error visible here: http://www.esquire.com/features/honesty0707
    **/
    if (!siblingNode) {
        continue;
    }

    dbg(
        "Looking at sibling node: " + siblingNode +
        " (" + siblingNode.className + ":" + siblingNode.id + ")" +
        ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')
    );
    dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

    /* The top candidate is always part of the article. */
    if (siblingNode === topCandidate) {
        append = true;
    }

    /* Give a bonus if sibling nodes and top candidates have the exact same classname */
    var contentBonus = 0;
    if (siblingNode.className === topCandidate.className && topCandidate.className !== "") {
        contentBonus += topCandidate.readability.contentScore * 0.2;
    }

    /* A scored sibling qualifies once its score plus bonus reaches the threshold. */
    if (typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore + contentBonus) >= siblingScoreThreshold) {
        append = true;
    }

    /*
     * Paragraph siblings qualify on their own when they are long with few
     * links, or short, link-free and contain a sentence-ending period.
     */
    if (siblingNode.nodeName === "P") {
        var linkDensity = readability.getLinkDensity(siblingNode);
        var nodeContent = readability.getInnerText(siblingNode);
        var nodeLength = nodeContent.length;

        if (
            (nodeLength > 80 && linkDensity < 0.25) ||
            (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
        ) {
            append = true;
        }
    }

    if (!append) {
        continue;
    }

    dbg("Appending node: " + siblingNode);

    var nodeToAppend = null;
    if (siblingNode.nodeName === "DIV" || siblingNode.nodeName === "P") {
        /* Common block element: move the node itself and step both counters back, since appending it removes it from siblingNodes. */
        nodeToAppend = siblingNode;
        s -= 1;
        sl -= 1;
    } else {
        /* We have a node that isn't a common block level element, like a form or td tag.
           Turn it into a div so it doesn't get filtered out later by accident. */
        dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
        nodeToAppend = document.createElement("DIV");
        try {
            nodeToAppend.id = siblingNode.id;
            nodeToAppend.innerHTML = siblingNode.innerHTML;
        } catch (er) {
            dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
            nodeToAppend = siblingNode;
            s -= 1;
            sl -= 1;
        }
    }

    /* To ensure a node does not interfere with readability styles, remove its classnames */
    nodeToAppend.className = "";

    /* Append sibling and subtract from our list because it removes the node when you append to another node */
    articleContent.appendChild(nodeToAppend);
}
支持GBK, GB2312等编码
- Node.js 版本:https://github.com/luin/readability
- PHP 版本:https://github.com/feelinglucky/php-readability
- Python 版本:https://github.com/timbertson/python-readability
- JS 版本:https://github.com/mozilla/readability
- Swift 版本:https://github.com/exyte/ReadabilityKit
- Java 版本:https://github.com/karussell/snacktory
- Ruby 版本:https://github.com/cantino/ruby-readability