mirror of https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
notes, cleanup
This commit is contained in:
parent 752331eaae
commit 11a2286659
84 NOTES.md Normal file
@@ -0,0 +1,84 @@
Each extractor should ultimately be an object that exports like so:

```javascript
import GenericContentExtractor from './content/extractor'
import GenericTitleExtractor from './title/extractor'
import GenericAuthorExtractor from './author/extractor'
import GenericDatePublishedExtractor from './date-published/extractor'
import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'

const GenericExtractor = {
  content: GenericContentExtractor,
  title: GenericTitleExtractor,
  author: GenericAuthorExtractor,
  datePublished: GenericDatePublishedExtractor,
  dek: GenericDekExtractor,
  leadImageUrl: GenericLeadImageUrlExtractor,
}
```

Custom parsers can then be merged with the generic parser to fill in gaps in their implementations. E.g.:

```javascript
import NYMagContentExtractor from '...'
import NYMagTitleExtractor from '...'

const NYMagExtractor = {
  content: NYMagContentExtractor,
  title: NYMagTitleExtractor,
}

const Extractor = {
  ...GenericExtractor,
  ...NYMagExtractor
}
```
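One way a merged extractor like this might then be selected at parse time — a minimal sketch assuming a hostname-keyed registry; the `Extractors` object and `getExtractor` helper are hypothetical names, not part of the codebase:

```javascript
// Hypothetical registry mapping hostnames to merged extractors
const Extractors = {
  'nymag.com': { ...GenericExtractor, ...NYMagExtractor },
}

// Fall back to the generic extractor for sites without a custom one
function getExtractor(url) {
  const { hostname } = new URL(url)
  return Extractors[hostname] || GenericExtractor
}
```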
# Declarative Custom Extractors

My goal is to create declarative extractors that describe what rather than how. So, for example:

```javascript
NYMagExtractor = {
  content: {
    // Order by most likely. Extractor will stop on first occurrence
    selectors: [
      'div.article-content',
      'section.body',
      'article.article',
    ],

    // Selectors to remove from the extracted content
    clean: [
      '.ad',
    ],

    // Array of transformations to make on matched elements.
    // Each item in the array is an object. The key is the
    // selector, the value is a transformation function
    // for the matching node.
    transforms: [
      // Convert h1s to h2s
      {
        'h1': ($node) => convertNodeTo($node, $, 'h2')
      },

      // Convert lazy-loaded noscript images to figures
      {
        'noscript': ($node) => {
          const $children = $node.children()
          if ($children.length === 1 && $children.get(0).tagName === 'img') {
            convertNodeTo($node, $, 'figure')
          }
        }
      }
    ]
  },

  title: [
    'h1',
  ]
}
```
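A minimal sketch of how a generic runner could interpret the declarative `content` block above, assuming cheerio for DOM work; `extractContent` and its exact semantics are assumptions, not the actual implementation:

```javascript
import cheerio from 'cheerio'

// Hypothetical interpreter for a declarative content config
function extractContent(html, config) {
  const $ = cheerio.load(html)

  // Try selectors in order; stop on the first that matches (the "what")
  const $matched = config.selectors
    .map(selector => $(selector))
    .find($nodes => $nodes.length > 0)
  if (!$matched) return null

  // Strip unwanted elements named by the clean list
  config.clean.forEach(selector => $matched.find(selector).remove())

  // Apply each { selector: fn } transform to matching descendants
  config.transforms.forEach(transform =>
    Object.keys(transform).forEach(selector =>
      $matched.find(selector).each((_, node) => transform[selector]($(node)))
    )
  )

  return $.html($matched)
}
```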
6 TODO.md
@@ -1,6 +1,7 @@
 TODO:
-Tmrw:
-- extractNextPageUrl
+- Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff)
+- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
+- extractNextPageUrl
 - Try Closure webpack compiler
 - Rename all cleaners from cleanThing to clean
 - Make sure weightNodes flag is being passed properly
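The first new TODO item above points at a concrete bug class: an extension check anchored at end-of-string misses image URLs with query strings. An illustrative regex (not the project's actual pattern) that allows for this:

```javascript
// Illustrative: match an image extension even when a query string follows
const IMG_EXTENSION_RE = /\.(jpe?g|png|gif)(\?.*)?$/i

IMG_EXTENSION_RE.test('https://example.com/foo.jpg')         // true
IMG_EXTENSION_RE.test('https://example.com/foo.jpg?w=1200')  // true — the case an end-of-string anchor would miss
IMG_EXTENSION_RE.test('https://example.com/page.html')       // false
```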
@@ -10,7 +11,6 @@ TODO:
 - Test if .is method is faster than regex methods
 - Separate constants into activity-specific folders (dom, scoring)
-
 
 DONE:
 x extractLeadImageUrl
 x extractDek
@@ -20,7 +20,7 @@ const GenericExtractor = {
 
   // Cached value of every meta name in our document.
   // Used when extracting title/author/date_published/dek
-  const metaCache = $('meta').map((index, node) => {
+  const metaCache = $('meta').map((_, node) => {
     return $(node).attr('name')
   }).toArray()
 
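A sketch of how that cache might be consulted so extractors only query the DOM for meta names known to exist; `extractFromMeta` is a hypothetical helper, and standard `content` attributes are assumed:

```javascript
// Hypothetical: return the first matching meta value, checking the cached
// name list before touching the DOM
function extractFromMeta($, metaNames, metaCache) {
  const name = metaNames.find(n => metaCache.includes(n))
  return name ? $(`meta[name="${name}"]`).attr('content') : null
}
```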
@@ -24,7 +24,7 @@ describe('GenericExtractor', () => {
       'California appears poised to be first to ban power-guzzling big-screen TVs'
     )
     assert.equal(
-      datePublished.toISOString(),
+      datePublished,
       '2009-10-14T04:00:00.000Z'
     )
     assert.equal(dek, null)
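The test change above implies `datePublished` now arrives as an ISO 8601 string rather than a `Date`. A minimal sketch of a cleaner that would produce that, assuming moment.js (which may not be what the project actually uses):

```javascript
import moment from 'moment'

// Sketch: normalize a raw date string to ISO 8601, or null if unparseable
function cleanDatePublished(dateString) {
  const date = moment(new Date(dateString))
  return date.isValid() ? date.toISOString() : null
}
```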