fix: author and date published selectors (#189)

pull/225/head
Ralph Jbeily 5 years ago committed by Adam Pash
parent 41efd361b5
commit f3f6e21fd8

@ -2,14 +2,16 @@ version: 2
jobs:
test-node:
docker:
- image: circleci/node:8.10
- image: circleci/node:8.10-browsers
steps:
- checkout
# For some reason phantomjs-prebuilt fails to install with yarn, but installing it with npm works
- run: "npm install phantomjs-prebuilt"
- run: "yarn install"
- run: "yarn lint:ci"
- run: "yarn build:ci"
- run: "./scripts/pr-parser-preview.sh"
- run: "yarn test:node --maxWorkers=4"
- run: "./scripts/pr-parser-preview.sh"
- store_artifacts:
path: tmp/artifacts
@ -20,7 +22,6 @@ jobs:
steps:
- checkout
- run: "yarn install"
# For some reason phantomjs-prebuilt fails to install with yarn, but installing it with npm works
- run: "yarn add karma-cli --dev"
- run: "yarn test:web --maxWorkers=4"
- run: "yarn build:web:ci --maxWorkers=4"

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -6,11 +6,15 @@ export const WwwWashingtonpostComExtractor = {
},
author: {
selectors: ['.pb-byline'],
selectors: [
'.pb-author-name',
],
},
date_published: {
selectors: [['.pb-timestamp[itemprop="datePublished"]', 'content']],
selectors: [
['.author-timestamp[itemprop="datePublished"]', 'content'],
],
},
dek: {

@ -14,11 +14,11 @@ describe('WwwWashingtonpostComExtractor', () => {
let url;
beforeAll(() => {
url =
'https://www.washingtonpost.com/politics/trump-foundation-apparently-admits-to-violating-ban-on-self-dealing-new-filing-to-irs-shows/2016/11/22/893f6508-b0a9-11e6-8616-52b15787add0_story.html';
const html = fs.readFileSync(
'./fixtures/www.washingtonpost.com/1480364838420.html'
);
result = Mercury.parse(url, html, { fallback: false });
'https://www.washingtonpost.com/news/opinions/wp/2018/10/29/enough-platitudes-lets-name-names/';
const html =
fs.readFileSync('./fixtures/www.washingtonpost.com/1546958901450.html');
result =
Mercury.parse(url, html, { fallback: false });
});
it('is selected properly', async () => {
@ -36,10 +36,7 @@ describe('WwwWashingtonpostComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(
title,
'Trump Foundation admits to violating ban on self-dealing, new filing to IRS shows'
);
assert.equal(title, 'Enough platitudes: Lets name names');
});
it('returns the author', async () => {
@ -49,7 +46,7 @@ describe('WwwWashingtonpostComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(author, 'David A. Fahrenthold');
assert.equal(author, 'Jennifer Rubin');
});
it('returns the date_published', async () => {
@ -59,7 +56,7 @@ describe('WwwWashingtonpostComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-11-22T13:57:00.000Z');
assert.equal(date_published, '2018-10-29T15:15:00.000Z');
});
it('returns the lead_image_url', async () => {
@ -71,7 +68,7 @@ describe('WwwWashingtonpostComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://img.washingtonpost.com/rf/image_1484w/2010-2019/WashingtonPost/2016/11/01/Others/Images/2016-11-01/Trump-HomeSafe-News-131478026931.jpg'
'https://www.washingtonpost.com/resizer/E6j9aM5bx4fpPedpdl2KxcSIci4=/1484x0/arc-anglerfish-washpost-prod-washpost.s3.amazonaws.com/public/GRLSHYNYVQZJBAUBKSFA26NTO4.jpg'
);
});
@ -95,7 +92,7 @@ describe('WwwWashingtonpostComExtractor', () => {
// the article.
assert.equal(
first13,
'Painter Michael Israel, left, poses with Donald and Melania Trump in 2007 at'
'Pittsburgh Mayor Bill Peduto on Sunday near the Tree of Life synagogue in'
);
});
});

@ -55,6 +55,20 @@ describe('Mercury', () => {
assert.equal(typeof result, 'object');
});
// Smoke test: parse a Washington Post opinion article by URL only.
// NOTE(review): no HTML is passed to Mercury.parse here, so this presumably
// performs a live fetch of the article — confirm against Mercury.parse.
it('does washingtonpost', async () => {
  // Raise the Jasmine default timeout (10000, milliseconds per the Jasmine docs).
  jasmine.DEFAULT_TIMEOUT_INTERVAL = 10000;

  const articleUrl =
    'https://www.washingtonpost.com/news/opinions/wp/2018/10/29/enough-platitudes-lets-name-names/';
  const result = await Mercury.parse(articleUrl);

  // The parser must hand back an object describing a single-page article...
  assert.equal(typeof result, 'object');
  assert.equal(result.total_pages, 1);
  // ...whose reported URL is the same one that was requested.
  assert.equal(result.url, articleUrl);
});
it('does the nyt', async () => {
const result = await Mercury.parse(
'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0'
@ -67,7 +81,7 @@ describe('Mercury', () => {
it('does ars pagination', async () => {
jasmine.DEFAULT_TIMEOUT_INTERVAL = 10000;
const url =
'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const result = await Mercury.parse(url, null, { fetchAllPages: true });
const { total_pages, pages_rendered } = result;

@ -89,7 +89,6 @@ export function baseDomain({ host }) {
export default async function fetchResource(url, parsedUrl) {
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
const options = {
url: parsedUrl.href,
headers: { ...REQUEST_HEADERS },
@ -101,8 +100,10 @@ export default async function fetchResource(url, parsedUrl) {
encoding: null,
// Accept and decode gzip
gzip: true,
// Follow any redirect
// Follow any non-GET redirects
followAllRedirects: true,
// Follow GET redirects
followRedirect: true,
};
const { response, body } = await get(options);

Loading…
Cancel
Save