pull/736/merge
Sarah Doire 1 year ago committed by GitHub
commit 290a60ca0d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -8342,7 +8342,6 @@ function scaffoldCustomParser(url) {
if (!fs.existsSync(dir)) {
newParser = true;
confirmCreateDir(dir, "Creating ".concat(hostname, " directory"));
confirmCreateDir("./fixtures/".concat(hostname), 'Creating fixtures directory');
}
confirm(mercury.fetchResource, [url], 'Fetching fixture', newParser);
@ -8379,8 +8378,7 @@ function savePage($, _ref, newParser) {
hostname = _URL$parse5.hostname;
spinner.succeed();
var filename = new Date().getTime();
var file = "./fixtures/".concat(hostname, "/").concat(filename, ".html"); // fix http(s) relative links:
var file = "./fixtures/".concat(hostname, ".html"); // fix http(s) relative links:
makeLinksAbsolute$$1($('*').first(), $, url);
$('[src], [href]').each(function (index, node) {

File diff suppressed because one or more lines are too long

12
dist/mercury.js vendored

@ -7846,6 +7846,12 @@ var RootExtractor = {
};
}
var extendedResults = {};
if (extractor.extend) {
extendedResults = selectExtendedTypes(extractor.extend, opts);
}
var title = extractResult(_objectSpread({}, opts, {
type: 'title'
}));
@ -7894,12 +7900,6 @@ var RootExtractor = {
url = _ref3.url,
domain = _ref3.domain;
var extendedResults = {};
if (extractor.extend) {
extendedResults = selectExtendedTypes(extractor.extend, opts);
}
return _objectSpread({
title: title,
content: content,

File diff suppressed because one or more lines are too long

@ -1,5 +0,0 @@
#!/bin/bash
# Replace every occurrence of $1 with $2 inside the files that find reports for $3.
#   $1 - text to search for (interpolated into the sed pattern)
#   $2 - replacement text
#   $3 - path or glob handed to find
# Print the third argument before doing any work.
echo $3
# $3 is left unquoted, so bash word-splits and glob-expands it before find runs.
# '%' is the sed delimiter here, so a literal '%' in $1 or $2 would break the expression.
# NOTE(review): `-i ''` looks like the BSD/macOS in-place form of sed — confirm the target platform.
find $3 -exec sed -i '' "s%$1%$2%g" '{}' \;

@ -61,7 +61,6 @@ function scaffoldCustomParser(url) {
if (!fs.existsSync(dir)) {
newParser = true;
confirmCreateDir(dir, `Creating ${hostname} directory`);
confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');
}
confirm(Parser.fetchResource, [url], 'Fetching fixture', newParser);
@ -99,8 +98,7 @@ function savePage($, [url], newParser) {
spinner.succeed();
const filename = new Date().getTime();
const file = `./fixtures/${hostname}/${filename}.html`;
const file = `./fixtures/${hostname}.html`;
// fix http(s) relative links:
makeLinksAbsolute($('*').first(), $, url);
$('[src], [href]').each((index, node) => {

@ -1,184 +1,81 @@
/* eslint-disable */
const { execFile, execFileSync } = require('child_process');
const fs = require('fs');
const fsPromises = require('fs/promises');
const path = require('path');
const URL = require('url');
const octokit = require('@octokit/rest')();
const Parser = require('../dist/mercury');
// get all fixtures
execFile('find', ['fixtures', '-type', 'f'], (err, stdout) => {
const fixtures = stdout.split('\n');
const now = new Date();
const twoWeeks = 2 * 7 * 24 * 60 * 60 * 1000;
// iterate through fixtures for fixtures older than 2 weeks
console.log('Finding fixtures to update...');
const fixturesToUpdate = fixtures
.filter(fixture => {
const timestamp = path
.basename(fixture)
.split(/\.html$/)[0]
.trim();
try {
const date = new Date(parseInt(timestamp, 10));
return now - date > twoWeeks;
} catch (e) {
// if fixture isn't a timestamp, ignore it
return false;
}
const FIXTURES_PATH = path.join(__dirname, '..', 'fixtures');
const perform = async () => {
const fixtures = (await fsPromises.readdir(FIXTURES_PATH)).filter(f =>
f.match(/\.html$/)
);
const TODAY = new Date();
const TWO_WEEKS_AGO = new Date(TODAY.setDate(TODAY.getDate() - 14));
console.log('Finding fixtures to update…');
const fixturesToUpdate = (await Promise.all(
fixtures.map(async filename => {
const stats = await fsPromises.stat(path.join(FIXTURES_PATH, filename));
return [filename, stats.mtime];
})
.slice(0, 1);
))
.filter(([_filename, timestamp]) => timestamp <= TWO_WEEKS_AGO)
.map(([filename, _timestamp]) => filename);
console.log(`${fixturesToUpdate.length} fixtures are out of date`);
// iterate through fixtures and extract their URLs.
console.log('Extracting urls...');
const baseDomains = fixturesToUpdate.map(fixture => fixture.split('/')[1]);
Promise.all(
fixturesToUpdate.map((fixture, i) => {
const html = fs.readFileSync(fixture);
return Parser.parse(`http://${baseDomains[i]}`, { html });
})
).then(parsedFixture => {
const fixturesAndUrls = fixturesToUpdate.reduce(
(acc, fixture, i) =>
acc.concat({
fixture,
url: parsedFixture[i].url,
baseDomain: baseDomains[i],
}),
[]
);
console.log('Updating all fixtures');
const fns = fixturesAndUrls
.map(fixtureAndUrl => {
return () => {
// console.log('Updating fixture for', fixtureAndUrl);
return updateFixture(fixtureAndUrl);
};
})
.concat(() => {
return new Promise(res => {
console.log('changed bases', changeBase);
console.log(`otherMess`, otherMess);
res();
});
});
promiseSerial(fns);
});
});
const changeBase = [];
const otherMess = [];
const updateFixture = ({ fixture, url, baseDomain }) => {
return new Promise(res => {
Parser.parse(url)
.then(({ url: updatedUrl }) => {
if (!updatedUrl) {
otherMess.push({ updatedUrl, url, fixture, baseDomain });
return res();
}
console.log(`updatedUrl`, updatedUrl);
const { hostname } = URL.parse(updatedUrl);
if (hostname !== baseDomain) {
console.log('Base URL has changed!!! Do something different');
console.log(`url`, url);
console.log(`updatedUrl`, updatedUrl);
console.log(`hostname`, hostname);
changeBase.push({
fixture,
url,
baseDomain,
newBaseDomain: hostname,
updatedUrl,
});
return res();
}
execFile('yarn', ['generate-parser', url], (err, stdout) => {
// console.log(`stdout`, stdout);
const dirRe = new RegExp(`(${path.dirname(fixture)}\/\\d+\.html)`);
const newFixture = stdout.match(dirRe)[0];
console.log(`newFixture`, newFixture);
// replace old fixture with new fixture in tests
execFile(
'./scripts/find-and-replace.sh',
[fixture, newFixture, 'src/extractors/custom/**/*.test.js'],
(err, stdout) => {
// remove old fixture
fs.unlinkSync(fixture);
const { branchName, commitMessage } = doTestsPass(baseDomain)
? {
branchName: `chore-update-${baseDomain}-fixture`,
commitMessage: `chore: update ${baseDomain} fixture`,
}
: {
branchName: `fix-update-${baseDomain}-extractor`,
commitMessage: `fix: update ${baseDomain} extractor`,
};
createAndPushBranch({ branchName, commitMessage });
createPR({ branchName, title: commitMessage });
}
);
const changeBase = [];
const otherMess = [];
console.log('Updating all fixtures');
for (const filename of fixturesToUpdate) {
const fixturePath = path.join(FIXTURES_PATH, filename);
const baseDomain = filename.replace(/(?:--[a-z-]+)?\.html$/, '');
const oldHtml = await fsPromises.readFile(fixturePath);
const { url } = await Parser.parse(`http://${baseDomain}`, {
html: oldHtml,
});
console.log(`Updating fixture for ${baseDomain} (${url})`);
try {
const { url: updatedUrl } = await Parser.parse(url);
if (!updatedUrl) {
otherMess.push({ updatedUrl, url, filename, baseDomain });
continue;
}
const { hostname } = URL.parse(updatedUrl);
if (hostname !== baseDomain) {
console.log(
`Base URL has changed from ${baseDomain} to ${hostname}, passing`
);
changeBase.push({
filename,
url,
baseDomain,
newBaseDomain: hostname,
updatedUrl,
});
})
.catch(e => {
otherMess.push({ fixture, url, baseDomain, e });
});
});
};
/**
 * Run `yarn test:node <site>` synchronously and report whether it succeeded.
 *
 * @param {string} site - argument forwarded to the `test:node` script.
 * @returns {boolean} true when execFileSync returns normally, false when it throws.
 */
const doTestsPass = site => {
  let passed = true;
  try {
    execFileSync('yarn', ['test:node', site]);
  } catch (e) {
    // Any throw from execFileSync is treated as "tests did not pass".
    passed = false;
  }
  return passed;
};
continue;
}
/**
 * Run promise-returning functions one after another and collect their results.
 *
 * Each function is invoked only after the previous one's promise has fulfilled.
 * Results are appended with Array.prototype.concat, so an array result is
 * flattened one level into the accumulated list.
 *
 * @param {Array<function(): Promise<*>>} funcs - functions to call in order.
 * @returns {Promise<Array<*>>} promise for the accumulated results.
 */
const promiseSerial = funcs => {
  let chain = Promise.resolve([]);
  funcs.forEach(task => {
    chain = chain.then(collected =>
      task().then(value => collected.concat(value))
    );
  });
  return chain;
};
const $ = await Parser.fetchResource(updatedUrl);
const newHtml = $.html();
/**
 * Commit the working tree on a new branch and push it to GitHub.
 *
 * Runs, in order: set git user.email, set git user.name, create and check out
 * `branchName`, stage everything, commit with `commitMessage`, then push
 * quietly to the postlight/parser repository using the GH_AUTH_TOKEN
 * environment variable as the URL credential.
 *
 * @param {Object} options
 * @param {string} options.branchName - name of the branch to create.
 * @param {string} options.commitMessage - message for the commit.
 */
const createAndPushBranch = ({ branchName, commitMessage }) => {
  // NOTE(review): the token is embedded in the push URL and may surface in
  // error output if the push fails — confirm this is acceptable.
  const pushTarget = `https://${process.env.GH_AUTH_TOKEN}@github.com/postlight/parser.git`;

  const gitSteps = [
    ['config', 'user.email', 'adam.pash+postlight-bot@postlight.com'],
    ['config', 'user.name', 'Postlight Bot'],
    ['checkout', '-b', branchName],
    ['add', '.'],
    ['commit', '-m', commitMessage],
    ['push', '-q', pushTarget],
  ];

  gitSteps.forEach(args => {
    execFileSync('git', args);
  });
};
await fsPromises.writeFile(fixturePath, newHtml);
} catch (e) {
console.log('Fixture update failed to parse', e);
}
}
const createPR = ({ branchName, title, body = '' }) => {
octokit.authenticate({
type: 'token',
token: process.env.GH_AUTH_TOKEN,
});
octokit.pulls.create({
owner: 'postlight',
repo: 'parser',
title,
head: branchName,
base: 'master',
body,
maintainer_can_modify: true,
});
console.log('changed bases', changeBase);
console.log('other mess', otherMess);
};
perform();

Loading…
Cancel
Save