Added custom extractor for www.spektrum.de (#677)

Co-authored-by: Simon Reinhardt <simon.reinhardt@hype.de>
Co-authored-by: John Holdun <john@johnholdun.com>
2 years ago · 035aa65dbc
parent f259d13753
commit 035aa65dbc
4 changed files with 563 additions and 0 deletions
--- a/fixtures/www.spektrum.de/1658928034778.html
+++ b/fixtures/www.spektrum.de/1658928034778.html
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@@ -141,3 +141,4 @@ export * from './www.gruene.de';
 export * from './www.engadget.com';
 export * from './arstechnica.com';
 export * from './www.ndtv.com';
+export * from './www.spektrum.de';
--- a/src/extractors/custom/www.spektrum.de/index.js
+++ b/src/extractors/custom/www.spektrum.de/index.js
@@ -0,0 +1,47 @@
+export const SpektrumExtractor = { // custom extraction rules for articles on www.spektrum.de
+  domain: 'www.spektrum.de',
+
+  title: {
+    selectors: ['.content__title'],
+  },
+
+  author: {
+    selectors: ['.content__author__info__name'],
+  },
+
+  date_published: {
+    selectors: ['.content__meta__date'],
+    timezone: 'Europe/Berlin', // the page shows a bare date such as '19.07.2022'; read it as German local time
+  },
+
+  dek: {
+    selectors: ['.content__intro'],
+  },
+
+  lead_image_url: {
+    selectors: [
+      // The Open Graph image meta tag as it appears in the original source code.
+      ['meta[name="og:image"]', 'value'],
+      // The same meta tag as it appears in the DOM in Chrome.
+      // This selector is included so that the extractor also works within the browser.
+      ['meta[property="og:image"]', 'content'],
+      // Last resort: the image that is shown on the page itself.
+      // It can be slightly cropped compared to the original referenced by the meta tag.
+      '.image__article__top img',
+    ],
+  },
+
+  content: {
+    selectors: ['article.content'], // also contains the dek, so the content starts with it (mercury-parser issue #676)
+    clean: [ // selectors for elements that are removed from the extracted content
+      '.breadcrumbs',
+      '.hide-for-print',
+      'aside',
+      'header h2',
+      '.image__article__top', // container of the on-page image that lead_image_url falls back to
+      '.content__author',
+      '.copyright',
+      '.callout-box',
+    ],
+  },
+};
--- a/src/extractors/custom/www.spektrum.de/index.test.js
+++ b/src/extractors/custom/www.spektrum.de/index.test.js
@@ -0,0 +1,87 @@
+import assert from 'assert';
+import URL from 'url';
+import cheerio from 'cheerio';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+const fs = require('fs');
+
+describe('SpektrumExtractor', () => {
+  describe('initial test case', () => {
+    let result;
+    let url;
+    beforeAll(() => { // parse the saved fixture once; every test below awaits the shared result
+      url =
+        'https://www.spektrum.de/news/genetik-das-geheimnis-der-parasitischen-rafflesien/2039026';
+      const html = fs.readFileSync(
+        './fixtures/www.spektrum.de/1658928034778.html'
+      );
+      result = Mercury.parse(url, { html, fallback: false });
+    });
+
+    it('is selected properly', () => {
+      const extractor = getExtractor(url);
+      assert.equal(extractor.domain, URL.parse(url).hostname); // the extractor chosen for the URL must be the one for its hostname
+    });
+
+    it('returns the title', async () => {
+      const { title } = await result;
+      assert.equal(title, 'Das Geheimnis der parasitischen Riesenblumen');
+    });
+
+    it('returns the author', async () => {
+      const { author } = await result;
+      assert.equal(author, 'Christie Wilcox');
+    });
+
+    it('returns the date_published', async () => {
+      const { date_published } = await result;
+      // The article shows '19.07.2022', which Mercury parses correctly as 0:00 CEST on the 19th of July
+      // (we help it along by providing a timezone in our extractor that should match the location of the website)
+      // and then converts to a UTC ISO string, so the expected value is 10 pm on the day before.
+      // See also https://github.com/postlight/mercury-parser/issues/549
+      assert.equal(date_published, '2022-07-18T22:00:00.000Z');
+    });
+
+    it('returns the dek', async () => {
+      const { dek } = await result;
+      assert.equal(
+        dek,
+        'Das bizarre Genom der größten Blütenpflanze der Welt offenbart, zu was Parasiten alles fähig sind: Sie stehlen, ' +
+          'löschen und duplizieren DNA und manipulieren vielleicht sogar die Gene ihres Wirts. Etliche Details sind aber noch ungeklärt.'
+      );
+    });
+
+    it('returns the lead_image_url', async () => {
+      const { lead_image_url } = await result;
+      assert.equal(
+        lead_image_url,
+        'https://static.spektrum.de/fm/912/f1920x1080/Rafflesia-arnoldii_AdobeStock_317147924_Maizal.jpeg'
+      );
+    });
+
+    it('returns the content', async () => {
+      const { content } = await result;
+
+      const $ = cheerio.load(content || '');
+
+      const first13 = excerptContent(
+        $('*')
+          .first()
+          .text(),
+        13 // the expected excerpt asserted below is 13 words long
+      );
+
+      // The commented-out assertion below holds the true beginning of the content. But since we have to include
+      // the dek in the content in order to be able to find it (see https://github.com/postlight/mercury-parser/issues/676),
+      // the content will start with the dek instead.
+      // assert.equal(first13, 'Auf den ersten Blick sind sie unsichtbar. In ihrer Heimat, den Wäldern Südostasiens,');
+      assert.equal(
+        first13,
+        'Das bizarre Genom der größten Blütenpflanze der Welt offenbart, zu was Parasiten alles'
+      );
+    });
+  });
+});