feat: Add a custom extractor for www.engadget.com. (#552)
* feat: Add a custom extractor for ma.ttias.be. When parsing content for cron.weekly issues, such as the one at https://ma.ttias.be/cronweekly/issue-130/, Mercury Parser would remove headings and ordered lists that were part of the content. This resolves that as follows: * Remove "id" attributes from "h1" and "h2" elements. Those attributes would result in the elements having a low weight. * Since Mercury Parser demotes "h1" elements to "h2", demote "h2" elements to "h3". * Add class="entry-content-asset" to "ul" elements to avoid them being removed. * Removed redundant comment. * feat: Add a custom extractor for engadget.com. Co-authored-by: John Holdun <john@johnholdun.com> pull/553/head^2
parent
13dfe720bd
commit
3c5c0bdba9
File diff suppressed because one or more lines are too long
@ -0,0 +1,53 @@
|
||||
/**
 * Custom extractor definition for articles hosted on www.engadget.com.
 *
 * Each field lists the selectors used to locate that piece of the article.
 * A selector entry is either a plain CSS selector string or a
 * `[selector, attribute]` pair (presumably "read this attribute of the
 * matched element" — confirm against the parser's extractor documentation).
 *
 * `date_published` and `lead_image_url` intentionally have no selectors; the
 * comments on those fields explain why nothing usable could be selected.
 */
export const WwwEngadgetComExtractor = {
  domain: 'www.engadget.com',

  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },

  author: {
    selectors: ['a.th-meta[data-ylk*="subsec:author"]'],
  },

  // Engadget stories have publish dates, but the only representation of them
  // on the page is in a relative format like "2h ago". There are also these
  // tags, which carry blank values:
  // <meta class="swiftype" name="published_at" data-type="date" value="">
  date_published: {
    selectors: [
      // Intentionally empty: no reliable selector is known (see note above).
    ],
  },

  dek: {
    selectors: ['div[class*="o-title_mark"] div'],
  },

  // Engadget stories do have lead images specified by an og:image meta tag,
  // but selecting the value attribute of that tag fails. I believe the "ℑ"
  // character sequence (presumably an HTML entity such as "&#x2111;" in the
  // raw markup — TODO confirm) is triggering this inability to select the
  // attribute value.
  lead_image_url: {
    selectors: [
      // Intentionally empty: no reliable selector is known (see note above).
    ],
  },

  content: {
    selectors: [
      [
        // Some figures will be inside div.article-text, but some header
        // figures/images will not, so those are selected separately.
        '#page_body figure:not(div.article-text figure)',
        'div.article-text',
      ],
    ],

    // Is there anything in the selected content that needs to be transformed
    // before it is consumable? E.g., unusual lazy-loaded images.
    transforms: {},

    // Is there anything in the result that shouldn't be? The clean selectors
    // remove anything that matches from the result.
    clean: [],
  },
};
|
Loading…
Reference in New Issue