fresh run of prettier; remove NOTES.md (#233)

pull/234/head
Adam Pash 5 years ago committed by GitHub
parent 244d17ddd3
commit 663cc45bf4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,84 +0,0 @@
Each extractor should ultimately be an object that exports like so:
```javascript
import GenericContentExtractor from './content/extractor'
import GenericTitleExtractor from './title/extractor'
import GenericAuthorExtractor from './author/extractor'
import GenericDatePublishedExtractor from './date-published/extractor'
import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
const GenericExtractor = {
content: GenericContentExtractor,
title: GenericTitleExtractor,
author: GenericAuthorExtractor,
datePublished: GenericDatePublishedExtractor,
dek: GenericDekExtractor,
leadImageUrl: GenericLeadImageUrlExtractor,
}
```
Custom parsers can then be merged with the generic parser to fill in gaps in their implementations. E.g.:
```javascript
import NYMagContentExtractor from '...'
import NYMagTitleExtractor from '...'
const NYMagExtractor = {
content: NYMagContentExtractor,
title: NYMagTitleExtractor,
}
const Extractor = {
...GenericExtractor,
...NYMagExtractor
}
```
# Declarative Custom Extractors
My goal is to create declarative extractors that describe what to extract rather than how to extract it. So, for example:
```javascript
NYMagExtractor = {
content: {
// Order by most likely. Extractor will stop on first occurrence
selectors: [
'div.article-content',
'section.body',
'article.article',
],
// Selectors to remove from the extracted content
clean: [
'.ad',
],
// Array of transformations to make on matched elements.
// Each item in the array is an object. The key is the
// selector, the value is a transformation function
// for the matching node.
transforms: [
// Convert h1s to h2s
{
'h1': ($node) => convertNodeTo($node, $, 'h2')
},
// Convert lazy-loaded noscript images to figures
{
'noscript': ($node) => {
const $children = $node.children()
if ($children.length === 1 && $children.get(0).tagName === 'img') {
convertNodeTo($node, $, 'figure')
}
}
}
]
},
title: [
'h1',
]
}
```

@ -1,6 +1,6 @@
import template from './index';
export default function (hostname, name) {
export default function(hostname, name) {
return template`
export const ${name} = {
domain: '${hostname}',

@ -13,9 +13,10 @@ export default function template(strings, ...values) {
indentLevel = /^\s{0,2}(.+)$/g;
}
return body.split('\n')
return body
.split('\n')
.slice(1)
.map((line) => {
.map(line => {
line = line.replace(indentLevel, '$1');
if (trailingWhitespace.test(line)) {

@ -1,9 +1,7 @@
import assert from 'assert';
import moment from 'moment-timezone';
import cleanDatePublished, {
cleanDateString,
} from './date-published';
import cleanDatePublished, { cleanDateString } from './date-published';
describe('cleanDatePublished(dateString)', () => {
it('returns a date', () => {

@ -2,38 +2,26 @@ export const WwwFastcompanyComExtractor = {
domain: 'www.fastcompany.com',
title: {
selectors: [
'h1',
],
selectors: ['h1'],
},
author: {
selectors: [
'.post__by',
],
selectors: ['.post__by'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
],
selectors: [['meta[name="article:published_time"]', 'value']],
},
dek: {
selectors: [
'.post__deck',
],
selectors: ['.post__deck'],
},
lead_image_url: {
selectors: [
['meta[name="og:image"]', 'value'],
],
selectors: [['meta[name="og:image"]', 'value']],
},
content: {
selectors: [
'.post__article',
],
selectors: ['.post__article'],
},
};

@ -15,10 +15,10 @@ describe('WwwFastcompanyComExtractor', () => {
beforeAll(() => {
url =
'https://www.fastcompany.com/3067012/the-only-five-email-folders-your-inbox-will-ever-need';
const html =
fs.readFileSync('./fixtures/www.fastcompany.com/1547124373499.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.fastcompany.com/1547124373499.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', () => {
@ -30,53 +30,62 @@ describe('WwwFastcompanyComExtractor', () => {
});
it('returns the title', async () => {
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
// To pass this test, fill out the title selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// the article.
assert.equal(title, 'The Only Five Email Folders Your Inbox Will Ever Need');
// Update these values with the expected values from
// the article.
assert.equal(
title,
'The Only Five Email Folders Your Inbox Will Ever Need'
);
});
it('returns the author', async () => {
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
// To pass this test, fill out the author selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
const { author } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(author, 'Zach Hanlon');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
const { date_published } = await result;
// Update these values with the expected values from
// the article.
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2017-01-09T05:00:00.000Z');
});
it('returns the dek', async () => {
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
// To pass this test, fill out the dek selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
const { dek } = await result;
// Update these values with the expected values from
// the article.
assert.equal(dek, 'Stop “organizing” your emails by subject and start thinking of them in terms of deadlines.');
// Update these values with the expected values from
// the article.
assert.equal(
dek,
'Stop “organizing” your emails by subject and start thinking of them in terms of deadlines.'
);
});
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
// To pass this test, fill out the lead_image_url selector
// in ./src/extractors/custom/www.fastcompany.com/index.js.
const { lead_image_url } = await result;
// Update these values with the expected values from
// the article.
assert.equal(lead_image_url, 'https://images.fastcompany.net/image/upload/w_1280,f_auto,q_auto,fl_lossy/fc/3067012-poster-p-1-the-only-five-email-folders-your-inbox-will-ever-need.jpg');
// Update these values with the expected values from
// the article.
assert.equal(
lead_image_url,
'https://images.fastcompany.net/image/upload/w_1280,f_auto,q_auto,fl_lossy/fc/3067012-poster-p-1-the-only-five-email-folders-your-inbox-will-ever-need.jpg'
);
});
it('returns the content', async () => {
@ -88,11 +97,19 @@ describe('WwwFastcompanyComExtractor', () => {
const $ = cheerio.load(content || '');
const first13 = excerptContent($('*').first().text(), 13);
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
// Update these values with the expected values from
// the article.
assert.equal(first13, 'For years, my approach to email was like slaying a hydra. For every');
assert.equal(
first13,
'For years, my approach to email was like slaying a hydra. For every'
);
});
});
});

@ -18,10 +18,12 @@ export const WwwFortinetComExtractor = {
},
content: {
selectors: ['div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12'],
selectors: [
'div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12',
],
transforms: {
noscript: ($node) => {
noscript: $node => {
const $children = $node.children();
if ($children.length === 1 && $children.get(0).tagName === 'img') {
return 'figure';

@ -15,7 +15,9 @@ describe('WwwFortinetComExtractor', () => {
beforeAll(() => {
url =
'https://www.fortinet.com/blog/threat-research/defeating-an-android-packer-with-frida.html';
const html = fs.readFileSync('./fixtures/www.fortinet.com/1546954846985.html');
const html = fs.readFileSync(
'./fixtures/www.fortinet.com/1546954846985.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
@ -34,7 +36,10 @@ describe('WwwFortinetComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, 'How-to Guide: Defeating an Android Packer with FRIDA');
assert.equal(
title,
'How-to Guide: Defeating an Android Packer with FRIDA'
);
});
it('returns the author', async () => {

@ -6,15 +6,11 @@ export const WwwWashingtonpostComExtractor = {
},
author: {
selectors: [
'.pb-author-name',
],
selectors: ['.pb-author-name'],
},
date_published: {
selectors: [
['.author-timestamp[itemprop="datePublished"]', 'content'],
],
selectors: [['.author-timestamp[itemprop="datePublished"]', 'content']],
},
dek: {

@ -15,10 +15,10 @@ describe('WwwWashingtonpostComExtractor', () => {
beforeAll(() => {
url =
'https://www.washingtonpost.com/news/opinions/wp/2018/10/29/enough-platitudes-lets-name-names/';
const html =
fs.readFileSync('./fixtures/www.washingtonpost.com/1546958901450.html');
result =
Mercury.parse(url, html, { fallback: false });
const html = fs.readFileSync(
'./fixtures/www.washingtonpost.com/1546958901450.html'
);
result = Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {

@ -1,4 +1,4 @@
import { MediumExtractor, BloggerExtractor } from "./custom";
import { MediumExtractor, BloggerExtractor } from './custom';
const Detectors = {
'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,

@ -57,13 +57,15 @@ export default function mergeSiblings($candidate, topScore, $) {
if (newScore >= siblingScoreThreshold) {
return wrappingDiv.append($sibling);
} if (sibling.tagName === 'p') {
}
if (sibling.tagName === 'p') {
const siblingContent = $sibling.text();
const siblingContentLength = textLength(siblingContent);
if (siblingContentLength > 80 && density < 0.25) {
return wrappingDiv.append($sibling);
} if (
}
if (
siblingContentLength <= 80 &&
density === 0 &&
hasSentenceEnd(siblingContent)

@ -15,13 +15,17 @@ export default function scoreNode($node) {
// Could save doing that regex test on every node AP
if (PARAGRAPH_SCORE_TAGS.test(tagName)) {
return scoreParagraph($node);
} if (tagName.toLowerCase() === 'div') {
}
if (tagName.toLowerCase() === 'div') {
return 5;
} if (CHILD_CONTENT_TAGS.test(tagName)) {
}
if (CHILD_CONTENT_TAGS.test(tagName)) {
return 3;
} if (BAD_TAGS.test(tagName)) {
}
if (BAD_TAGS.test(tagName)) {
return -3;
} if (tagName.toLowerCase() === 'th') {
}
if (tagName.toLowerCase() === 'th') {
return -5;
}

@ -2,10 +2,7 @@ import assert from 'assert';
import URL from 'url';
import { record } from 'test-helpers';
import fetchResource, {
baseDomain,
validateResponse,
} from './fetch-resource';
import fetchResource, { baseDomain, validateResponse } from './fetch-resource';
import { MAX_CONTENT_LENGTH } from './constants';
describe('fetchResource(url)', () => {

@ -77,7 +77,6 @@ function removeUnlessContent($node, $, weight) {
// Too many script tags, not enough content.
if (scriptCount > 0 && contentLength < 150) {
$node.remove();
}
}
}

@ -13,7 +13,8 @@ export function linkDensity($node) {
if (totalTextLength > 0) {
return linkLength / totalTextLength;
} if (totalTextLength === 0 && linkLength > 0) {
}
if (totalTextLength === 0 && linkLength > 0) {
return 1;
}

@ -21,7 +21,8 @@ export default function stripUnlikelyCandidates($) {
const classAndId = `${classes || ''} ${id || ''}`;
if (CANDIDATES_WHITELIST.test(classAndId)) {
return;
} if (CANDIDATES_BLACKLIST.test(classAndId)) {
}
if (CANDIDATES_BLACKLIST.test(classAndId)) {
$node.remove();
}
});

Loading…
Cancel
Save