From e217648c0be8fb48f40033e43da2134e71266c56 Mon Sep 17 00:00:00 2001 From: John Brayton Date: Mon, 9 May 2022 12:07:27 -0400 Subject: [PATCH] feat: ma.ttias.be extractor (#551) * feat:Add a custom extractor for ma.ttias.be. When parsing content for cron.weekly issues, such as the one at https://ma.ttias.be/cronweekly/issue-130/, Mercury Parser would remove headings and ordered lists that were part of the content. This resolves that as follows: * Remove "id" attributes from "h1" and "h2" elements. Those attributes would result in the elements having a low weight. * Since Mercury Parser demotes "h1" elements to "h2", demote "h2" elements to "h3". * Add class="entry-content-asset" to "ul" elements to avoid them being removed. * removed redundant comment. Co-authored-by: John Holdun --- fixtures/ma.ttias.be/1587659928239.html | 540 ++++++++++++++++++ src/extractors/custom/index.js | 1 + src/extractors/custom/ma.ttias.be/index.js | 46 ++ .../custom/ma.ttias.be/index.test.js | 125 ++++ 4 files changed, 712 insertions(+) create mode 100644 fixtures/ma.ttias.be/1587659928239.html create mode 100644 src/extractors/custom/ma.ttias.be/index.js create mode 100644 src/extractors/custom/ma.ttias.be/index.test.js diff --git a/fixtures/ma.ttias.be/1587659928239.html b/fixtures/ma.ttias.be/1587659928239.html new file mode 100644 index 00000000..03515ab6 --- /dev/null +++ b/fixtures/ma.ttias.be/1587659928239.html @@ -0,0 +1,540 @@ + + + + + + + cron.weekly issue #130: Github, keycloak, proc, redis6, cron & more + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+

+ cron.weekly issue #130: Github, keycloak, proc, redis6, cron & more +

+ +
+ +
+

+ cron.weekly is a newsletter about Linux, open source & webdevelopment. Want to get it in your inbox every Sunday? Subscribe below! +

+ +
+ + +
+ + I respect your privacy and you won't get spam. Ever. Just a weekly-ish newsletter about Linux and open source. + +
+
+
+
+
+
+ +
+
+
+ Image of Mattias Geniar +
+

+ Mattias Geniar, April 19, 2020 +

+

+ Follow me on Twitter as @mattiasgeniar +

+
+
+ + + + +
+
+ + +

Hi everyone! 👋

+ +

Welcome to cron.weekly issue #130.

+ +

There’s quite a bit of news from Github this week, together with Windows 10 (I know, on a Linux-focussed newsletter no less!) and ARM servers. I should have some content for everyone to enjoy.

+ +

Stay strong, stay home, stay alive.

+ +

Oh, and ☕️, of course.

+ +

News & general 🗞

+ +

Is BGP safe yet? No.

+ +

Did you know the internet is held together with duct tape? BGP is one of those pieces of tape. Resource Public Key Infrastructure (RPKI) can help authenticate routes, but is only seeing slow adoption.

+ +

GitHub is now free for teams

+ +

This is a pretty big move on their part: you can now create teams and host unlimited private git repositories at Github for free!

+ +
+

We’ve wanted to make this change for the last 18 months, but needed our Enterprise business to be big enough to enable the free use of GitHub by the rest of the world. - Nat Friedman

+
+ +

Big players pay for the little ones, I appreciate that.

+ +

Academics steal data from air-gapped systems using PC fan vibrations

+ +

Even your computer fan can leak your personal data. tl;dr: don’t use computers. Ever.

+ +

ICANN delays .org sell off

+ +

Good news, for now - as the .org sale announced late last year is put on hold. Question is, for how long?

+ +

npm has joined GitHub

+ +

GitHub has completed its acquisition of npm. Which is a bit of a weird announcement from Github, as npm already announced it last month? Oh well, corporate stuff. 😄

+ +

Scaleway ARM64 instances reach end-of-life

+ +

Scaleway is sunsetting its ARM64 instances at the end of this year. This is a bit of a surprise move, as they reached their fame (I believe?) mostly because they did ARM server hosting at scale.

+ +

In an age where Apple is supposed to release ARM laptops within the year, I’d be very curious to learn why Scaleway is terminating their ARM support.

+ +

Windows WSL2 & Explorer integration for Linux

+ +

Two interesting stories that caught my attention last week from the Windows front:

+ + + +

The new Windows Subsystem for Linux 2 (WSL2) will run as a lightweight VM running a full Linux kernel, which means it’ll support things like Docker and FUSE.

+ +

The new Explorer integration means you’ll be able to access the files on that tiny Linux VM much easier.

+ +

Tools & Projects 🛠

+ +

Learning-to-See-in-the-Dark

+ +

This is crazy: a machine learning dataset that can turn just about any (really) dark image into a bright-as-day version. The video blew my mind.

+ +

ProtonMail Bridge

+ +

The ProtonMail Bridge is an application that runs on your computer in the background and seamlessly encrypts and decrypts your mail as it enters and leaves your computer.

+ +

Simplify Complexity in Containerized Environments

+ +

Enhance visibility into containers and container orchestration with Datadog. Automatically track containerized services with Autodiscovery and receive smarter alerts that won’t panic as customers scale down. Easily monitor the health of all your containers with granular, real-time metrics and visualize performance from a bird’s eye view with Datadog’s live container map. Start your free Datadog trial today! Sponsored

+ +

Phoenix LiveDashboard

+ +

LiveDashboard provides real-time performance monitoring and debugging tools for Phoenix developers.

+ +

prestissimo

+ +

This is a composer plugin (PHP’s package manager) that downloads packages in parallel to speed up the installation process.

+ +

Kanboard

+ +

Kanboard is a free and open-source Kanban project management software.

+ +

Hund: versatile service monitoring

+ +

Monitor your services every 30 seconds, get notified the way you want! Slack, Email, Webhooks, … your pick. See rich metrics and use our customizable status pages to keep all your customers informed. Try us for free for 30 days! Sponsored

+ +

regex2fat

+ +

“Did you ever want to match a regex, but all you had was a fat32 driver? Ever wanted to serialize your regex DFAs into one of the most widely supported formats used by over 3 billion devices?”

+ +

I have no idea how any of this works, but it got shared far & wide on the internet, so I figure I’ll do the same here. But really … even the tagline confuses me. 🙈

+ +

falcon

+ +

Falcon is a free, open-source SQL editor with inline data visualization. It currently supports connecting to RedShift, MySQL, PostgreSQL, IBM DB2, Impala, MS SQL, Oracle, SQLite.

+ +

keycloak

+ +

Keycloak is an Open Source Identity and Access Management solution for modern Applications and Services. It allows you to add authentication to applications and secure services with minimum fuss. No need to deal with storing users or authenticating users. It’s all available out of the box. You’ll even get advanced features such as User Federation, Identity Brokering and Social Login.

+ +

mdBook

+ +

Ever wanted to write a book (You’re crazy)? Ever wanted to do it in Markdown (OK, maybe you’re not crazy)? You can use mdBook to take Markdown files, parse them & create online books.

+ +

3mux

+ +

3mux is a terminal multiplexer with out-of-the-box support for search, mouse-controlled scrollback, and i3-like keybindings. Imagine tmux with a smaller learning curve and more sane defaults.

+ +

Guides & Tutorials 🎓

+ +

Now I Understand why Almost No One uses Encrypted Email

+ +

It’s true, using PGP with email isn’t very user friendly, to this day. This post contains the commands used to send encrypted e-mails, and it’s enough to put you off - honestly.

+ +

Troubleshoot using the proc filesystem on Linux

+ +

A good overview of the basics when looking at the /proc filesystem. I use this all the time when debugging, it’s a good place to start a lot of debug-quests.

+ +

Comparing the new Redis6 multithreaded I/O to Elasticache & KeyDB

+ +

The team at KeyDB compares its key/value daemon (originally a fork of Redis) with the latest Redis 6. Surprise surprise, KeyDB wins. 😅 But the metrics & numbers are interesting enough to share the post regardless.

+ +

Why does cron only offer minute granularity?

+ +

In short, compatibility. The format that crontab uses is described in minute detail as part of the POSIX Specification.

+ +

How to Boost UDP Transaction Performance

+ +

A nice in-depth set of slides to optimize UDP traffic on a Linux server. This might come in handy when HTTP/3 takes off and you want to optimise your throughput.

+ +

Rename files in linux without typing the full name twice

+ +

This is clever use of a Bash alias and the read method in Bash to prompt for input.

+ +

Technical reasons to choose FreeBSD over GNU/Linux

+ +

This post covers some of the technical reasons to choose FreeBSD over GNU/Linux.

+ +

Wireguard VPN: Typical Setup

+ +

This guide helps you set up Wireguard VPN on your Linux box with step-by-step instructions.

+ +
+ + +
+ + + + + + +
+ +
+
+ +
+
+ +
+ +
+
+

Want to subscribe to the cron.weekly newsletter?

+ +

+ I write a weekly-ish newsletter on Linux, open source & webdevelopment called cron.weekly. +

+ +

+ It features the latest news, guides & tutorials and new open source projects. You can sign up via email below. +

+ +
+ +
+ +

+ No spam. Just some good, practical Linux & open source content. +

+
+
+
+
+ + +
+ + +
diff --git a/src/extractors/custom/index.js b/src/extractors/custom/index.js
index 8325876d..86355935 100644
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@@ -134,3 +134,4 @@ export * from './biorxiv.org';
 export * from './epaper.zeit.de';
 export * from './www.ladbible.com';
 export * from './timesofindia.indiatimes.com';
+export * from './ma.ttias.be';
\ No newline at end of file
diff --git a/src/extractors/custom/ma.ttias.be/index.js b/src/extractors/custom/ma.ttias.be/index.js
new file mode 100644
index 00000000..51f0cefd
--- /dev/null
+++ b/src/extractors/custom/ma.ttias.be/index.js
@@ -0,0 +1,46 @@
+export const MaTtiasBeExtractor = {
+  domain: 'ma.ttias.be',
+
+  title: {
+    selectors: [['meta[name="twitter:title"]', 'value']],
+  },
+
+  author: {
+    selectors: [['meta[name="author"]', 'value']],
+  },
+
+  date_published: {
+    selectors: [['meta[name="article:published_time"]', 'value']],
+  },
+
+  content: {
+    selectors: [['.content']],
+
+    // Per-tag fixups for the selected content so that cron.weekly headings and
+    // link lists are not stripped from the extracted article.
+    transforms: {
+      h2: $node => {
+        // The "id" attribute values would result in low scores and the element being
+        // removed.
+        $node.attr('id', null);
+
+        // h1 elements will be demoted to h2, so demote h2 elements to h3.
+        return 'h3';
+      },
+      h1: $node => {
+        // The "id" attribute values would result in low scores and the element being
+        // removed.
+        $node.attr('id', null);
+
+        // A subsequent h2 will be removed if there is not a paragraph before it, so
+        // add a paragraph here. It will be removed anyway because it is empty.
+        $node.after('<p></p>');
+      },
+      ul: $node => {
+        // Articles contain lists of links which look like, but are not, navigation
+        // elements. Adding this class attribute avoids them being incorrectly removed.
+        $node.attr('class', 'entry-content-asset');
+      },
+    },
+  },
+};
diff --git a/src/extractors/custom/ma.ttias.be/index.test.js b/src/extractors/custom/ma.ttias.be/index.test.js
new file mode 100644
index 00000000..9aa73b20
--- /dev/null
+++ b/src/extractors/custom/ma.ttias.be/index.test.js
@@ -0,0 +1,125 @@
+import assert from 'assert';
+import URL from 'url';
+import cheerio from 'cheerio';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+const fs = require('fs');
+
+describe('MaTtiasBeExtractor', () => {
+  describe('initial test case', () => {
+    let result;
+    let url;
+    beforeAll(() => {
+      url = 'https://ma.ttias.be/cronweekly/issue-130/';
+      const html = fs.readFileSync('./fixtures/ma.ttias.be/1587659928239.html');
+      result = Mercury.parse(url, { html, fallback: false });
+    });
+
+    it('is selected properly', () => {
+      // This test should be passing by default.
+      // It sanity checks that the correct parser
+      // is being selected for URLs from this domain
+      const extractor = getExtractor(url);
+      assert.equal(extractor.domain, URL.parse(url).hostname);
+    });
+
+    it('returns the title', async () => {
+      // To pass this test, fill out the title selector
+      // in ./src/extractors/custom/ma.ttias.be/index.js.
+      const { title } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(
+        title,
+        `cron.weekly issue #130: Github, keycloak, proc, redis6, cron & more`
+      );
+    });
+
+    it('returns the author', async () => {
+      // To pass this test, fill out the author selector
+      // in ./src/extractors/custom/ma.ttias.be/index.js.
+      const { author } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(author, 'Mattias Geniar');
+    });
+
+    it('returns the date_published', async () => {
+      // To pass this test, fill out the date_published selector
+      // in ./src/extractors/custom/ma.ttias.be/index.js.
+      const { date_published } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(date_published, `2020-04-19T05:50:00.000Z`);
+    });
+
+    it('returns the dek', async () => {
+      // To pass this test, fill out the dek selector
+      // in ./src/extractors/custom/ma.ttias.be/index.js.
+      const { dek } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(dek, null);
+    });
+
+    it('returns the lead_image_url', async () => {
+      // To pass this test, fill out the lead_image_url selector
+      // in ./src/extractors/custom/ma.ttias.be/index.js.
+      const { lead_image_url } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(lead_image_url, null);
+    });
+
+    it('returns the content', async () => {
+      const html = fs.readFileSync('./fixtures/ma.ttias.be/1587659928239.html');
+      const uri = 'https://ma.ttias.be/cronweekly/issue-130/';
+
+      const { content } = await Mercury.parse(uri, { html });
+
+      const $ = cheerio.load(content || '');
+
+      // Ensure that there are 3 h2 elements.
+      const h2 = $('h2');
+
+      assert.equal(h2.length, 3);
+
+      // Ensure that there are 27 h3 elements.
+      const h3 = $('h3');
+
+      assert.equal(h3.length, 27);
+
+      // Ensure that there is 1 ul element.
+      const ul = $('ul');
+
+      assert.equal(ul.length, 1);
+
+      // Ensure that there are no nav elements.
+      const nav = $('nav');
+
+      assert.equal(nav.length, 0);
+
+      // Check the first 13 words.
+
+      const first13 = excerptContent(
+        $('*')
+          .first()
+          .text(),
+        13
+      );
+
+      assert.equal(
+        first13,
+        'Hi everyone! 👋 Welcome to cron.weekly issue #130. There’s quite a bit of'
+      );
+    });
+  });
+});