feat: arstechnica.com extractor (#553)

* feat: Add a custom extractor for ma.ttias.be.

When parsing content for cron.weekly issues, such as the one at https://ma.ttias.be/cronweekly/issue-130/, Mercury Parser would remove headings and ordered lists that were part of the content. This resolves that as follows:

* Remove "id" attributes from "h1" and "h2" elements. Those attributes would result in the elements having a low weight.
* Since Mercury Parser demotes "h1" elements to "h2", demote "h2" elements to "h3".
* Add class="entry-content-asset" to "ul" elements to avoid them being removed.

* removed redundant comment.

* feat: Add a custom extractor for engadget.com.

* Works, but I need to figure out how to make pagination work correctly.

* Fixed pagination: the parser would only retrieve the first or second page because we would send contentOnly: true on subsequent pages (page 2).
Removed failover: true from preview.

* rolled back { fallback: false } option removal

* Clarified comments.

Co-authored-by: John Holdun <john@johnholdun.com>
pull/554/head^2
John Brayton 2 years ago committed by GitHub
parent 3c5c0bdba9
commit 143631b4b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,525 @@
<!DOCTYPE html>
<html lang="en-us">
<head>
<title>The connected renter: How to make your apartment smarter | Ars Technica</title>
<link rel="stylesheet" type="text/css" media="all" href="https://cdn.arstechnica.net/wp-content/themes/ars/assets/css/main-ad39ba78fc.css">
<link rel="alternate" type="application/rss+xml" href="http://feeds.arstechnica.com/arstechnica/index/">
<link rel="shortcut icon" href="https://cdn.arstechnica.net/favicon.ico">
<link rel="icon" type="image/x-icon" href="https://cdn.arstechnica.net/favicon.ico">
<link rel="apple-touch-icon" sizes="180x180" href="https://cdn.arstechnica.net/wp-content/themes/ars/assets/img/ars-ios-icon-d9a45f558c.png">
<link rel="mask-icon" href="https://cdn.arstechnica.net/wp-content/themes/ars/assets/img/ars-macos-safari-8997f76b21.svg" color="#ff4e00">
<link rel="icon" sizes="192x192" href="https://cdn.arstechnica.net/wp-content/themes/ars/assets/img/material-ars-db41652381.png">
<meta name="application-name" value="Ars Technica">
<meta name="msapplication-starturl" value="http://arstechnica.com/">
<meta name="msapplication-tooltip" value="Ars Technica: Serving the technologist for 1.2 decades">
<meta name="msapplication-task" value="name=News;action-uri=http://arstechnica.com/;icon-uri=https://cdn.arstechnica.net/favicon.ico">
<meta name="msapplication-task" value="name=Features;action-uri=http://arstechnica.com/features/;icon-uri=https://cdn.arstechnica.net/ie-jump-menu/jump-features.ico">
<meta name="msapplication-task" value="name=OpenForum;action-uri=http://arstechnica.com/civis/;icon-uri=https://cdn.arstechnica.net/ie-jump-menu/jump-forum.ico">
<meta name="msapplication-task" value="name=Subscribe;action-uri=http://arstechnica.com/subscriptions/;icon-uri=https://cdn.arstechnica.net/ie-jump-menu/jump-subscribe.ico">
<meta http-equiv="Content-Type" value="text/html; charset=utf-8">
<meta name="advertising" value="ask">
<meta value="592156917" name="fb:admins">
<meta value="108943" name="fb:admins">
<meta value="19374573752" name="fb:pages">
<meta name="format-detection" value="telephone=no">
<meta name="theme-color" value="#000000">
<meta name="viewport" value="width=device-width,initial-scale=1">
<meta name="parsely-page" value="{&quot;title&quot;:&quot;The connected renter: How to make your apartment smarter&quot;,&quot;link&quot;:&quot;https:\/\/arstechnica.com\/gadgets\/2016\/08\/the-connected-renter-how-to-make-your-apartment-smarter\/&quot;,&quot;type&quot;:&quot;post&quot;,&quot;author&quot;:&quot;Valentina Palladino&quot;,&quot;post_id&quot;:921807,&quot;pub_date&quot;:&quot;2016-08-10T11:15:53Z&quot;,&quot;section&quot;:&quot;Tech&quot;,&quot;tags&quot;:[&quot;gadgetology&quot;,&quot;iot&quot;,&quot;smart-apartments&quot;,&quot;smart-home&quot;,&quot;type: feature&quot;],&quot;image_url&quot;:&quot;https:\/\/cdn.arstechnica.net\/wp-content\/uploads\/2016\/07\/smartapartment_hero5-150x150.jpg&quot;}">
<meta name="parsely-metadata" value="{&quot;type&quot;:&quot;feature&quot;,&quot;title&quot;:&quot;The connected renter: How to make your apartment smarter&quot;,&quot;post_id&quot;:921807,&quot;lower_deck&quot;:&quot;Turning your rented space into a smart home can be tricky; we have some advice.&quot;,&quot;image_url&quot;:&quot;https:\/\/cdn.arstechnica.net\/wp-content\/uploads\/2016\/07\/smartapartment_hero5-150x150.jpg&quot;,&quot;listing_image_url&quot;:&quot;https:\/\/cdn.arstechnica.net\/wp-content\/uploads\/2016\/07\/smartapartment_hero5-300x150.jpg&quot;}">
<link rel="canonical" href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/">
<link rel="amphtml" href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/?amp=1">
<link rel="shorturl" href="https://arstechnica.com/?p=921807">
<meta name="description" value="Turning your rented space into a smart home can be tricky; we have some advice.">
<meta name="twitter:card" value="summary_large_image">
<meta name="twitter:url" value="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/">
<meta name="twitter:title" value="The connected renter: How to make your apartment smarter">
<meta name="twitter:description" value="Turning your rented space into a smart home can be tricky; we have some advice.">
<meta name="twitter:site" value="@arstechnica">
<meta name="twitter:domain" value="arstechnica.com">
<meta value="Ars Technica" name="og:site_name">
<meta name="twitter:image:src" value="https://cdn.arstechnica.net/wp-content/uploads/2016/07/smartapartment_hero5-640x215.jpg">
<meta name="twitter:image:width" value="640">
<meta name="twitter:image:height" value="215">
<meta name="twitter:creator" value="@valentinalucia">
<meta value="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/" name="og:url">
<meta value="The connected renter: How to make your apartment smarter" name="og:title">
<meta value="https://cdn.arstechnica.net/wp-content/uploads/2016/07/smartapartment_hero5-640x215.jpg" name="og:image">
<meta value="Turning your rented space into a smart home can be tricky; we have some advice." name="og:description">
<meta value="article" name="og:type">
<link rel="dns-prefetch" href="https://aax.amazon-adsystem.com/">
<link rel="preconnect" href="https://aax.amazon-adsystem.com/" crossorigin="">
<link rel="preconnect" href="https://mb.moatads.com/" crossorigin="">
</head>
<body class="post-template-default single single-post postid-921807 single-format-standard grid-view light fullwidth blog-us">
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-NLXNPCQ" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<aside class="ad ad_crown" aria-label="Top of page advertisement"></aside>
<div class="site-wrapper">
<a class="screen-reader-text skip-link" href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/#main" aria-label="Skip to main content">Skip to main content</a>
<header class="site-header">
<div class="header-left">
<a href="https://arstechnica.com/" id="header-logo" title="Ars Technica Homepage">
<span class="icon icon-logo-ars-us"></span>
</a>
</div>
<div class="header-right">
<nav id="header-nav-primary">
<ul>
<li><a class="nav-link section-information-technology " href="https://arstechnica.com/information-technology/">Biz &amp; IT</a></li>
<li><a class="nav-link section-gadgets active" href="https://arstechnica.com/gadgets/">Tech</a></li>
<li><a class="nav-link section-science " href="https://arstechnica.com/science/">Science</a></li>
<li><a class="nav-link section-tech-policy " href="https://arstechnica.com/tech-policy/">Policy</a></li>
<li><a class="nav-link section-cars " href="https://arstechnica.com/cars/">Cars</a></li>
<li><a class="nav-link section-gaming " href="https://arstechnica.com/gaming/">Gaming &amp; Culture</a></li>
<li><a class="nav-link store" href="https://arstechnica.com/store/">Store</a></li>
<li><a class="nav-link forums" href="https://arstechnica.com/civis/">Forums</a></li>
</ul>
</nav>
<a href="https://arstechnica.com/store/product/subscriptions/" class="header-highlight-link">Subscribe</a>
<div class="dropdown" id="header-search">
<a href="https://arstechnica.com/search/" class="dropdown-toggle search-toggle" aria-label="Search" aria-expanded="false">
<span class="icon icon-search-mag-glass"></span>
</a>
<div class="dropdown-content">
<a class="nav-search-close">Close</a>
</div>
</div>
<div class="dropdown dropdown-mega" id="header-burger">
<a href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/#site-menu" class="dropdown-toggle" aria-label="Menu" aria-expanded="false">
<span></span>
</a>
<div id="site-menu" class="dropdown-content">
<section class="burger-navigate">
<h3>
<span class="icon icon-half-target"></span>
Navigate
</h3>
<ul>
<li><a class="nav-link store" href="https://arstechnica.com/store/">Store</a></li>
<li><a class="nav-link subscribe" href="https://arstechnica.com/store/product/subscriptions/">Subscribe</a></li>
<li><a class="nav-link videos" href="http://video.arstechnica.com/">Videos</a></li>
<li><a class="nav-link section-features" href="https://arstechnica.com/features/">Features</a></li>
<li><a class="nav-link section-reviews" href="https://arstechnica.com/reviews/">Reviews</a></li>
</ul>
<ul>
<li><a class="nav-link page-rss-feeds" href="https://arstechnica.com/rss-feeds/">RSS Feeds</a></li>
<li><a class="nav-link mobile" href="https://arstechnica.com/?view=mobile">Mobile Site</a></li>
</ul>
<ul>
<li><a class="nav-link page-about-us" href="https://arstechnica.com/about-us/">About Ars</a></li>
<li><a class="nav-link page-staff-directory" href="https://arstechnica.com/staff-directory/">Staff Directory</a></li>
<li><a class="nav-link page-contact-us" href="https://arstechnica.com/contact-us/">Contact Us</a></li>
</ul>
<ul>
<li><a class="nav-link page-advertise-with-us" href="https://arstechnica.com/advertise-with-us/">Advertise with Ars</a></li>
<li><a class="nav-link page-reprints" href="https://arstechnica.com/reprints/">Reprints</a></li>
</ul>
</section>
<section class="burger-filter">
<h3>
<span class="icon icon-half-mag"></span>
Filter by topic
</h3>
<ul id="burger-nav-primary">
<li><a class="nav-link section-information-technology " href="https://arstechnica.com/information-technology/">Biz &amp; IT</a></li>
<li><a class="nav-link section-gadgets active" href="https://arstechnica.com/gadgets/">Tech</a></li>
<li><a class="nav-link section-science " href="https://arstechnica.com/science/">Science</a></li>
<li><a class="nav-link section-tech-policy " href="https://arstechnica.com/tech-policy/">Policy</a></li>
<li><a class="nav-link section-cars " href="https://arstechnica.com/cars/">Cars</a></li>
<li><a class="nav-link section-gaming " href="https://arstechnica.com/gaming/">Gaming &amp; Culture</a></li>
<li><a class="nav-link store" href="https://arstechnica.com/store/">Store</a></li>
<li><a class="nav-link forums" href="https://arstechnica.com/civis/">Forums</a></li>
</ul>
</section>
<section class="burger-settings">
<h3>
<span class="icon icon-half-gear"></span>
Settings
</h3>
<div>
<div class="burger-layout">
<p>Front page layout</p>
<div class="burger-layout-grid">
<a rel="nofollow" href="http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/?view=grid" class="">
<span class="icon icon-grid"></span><br>
Grid
<div class="faux-radio active"></div>
</a>
</div>
<div class="burger-layout-list">
<a rel="nofollow" href="http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/?view=archive" class="">
<span class="icon icon-list"></span><br>
List
<div class="faux-radio "></div>
</a>
</div>
</div>
<div class="burger-theme">
<p>Site theme</p>
<div class="burger-theme-light">
<a rel="nofollow" href="http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/?theme=light" class="">
<span><span>Black on white</span></span>
<div class="faux-radio active"></div>
</a>
</div>
<div class="burger-theme-dark">
<a rel="nofollow" href="http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/?theme=dark" class="">
<span><span>White on black</span></span>
<div class="faux-radio "></div>
</a>
</div>
</div>
</div>
</section>
</div>
</div>
<div class="dropdown dropdown-mega" id="header-account">
<a href="https://arstechnica.com/civis/ucp.php?mode=login&amp;return_to=%2Fgadgets%2F2016%2F08%2Fthe-connected-renter-how-to-make-your-apartment-smarter%2F" class="dropdown-toggle" aria-expanded="false">
Sign in
<span class="icon dropdown-indicator icon-drop-indicator"></span>
</a>
<div class="dropdown-content">
<section class="profile-activity">
<h3>
<span class="icon icon-half-bubble-up"></span>
Comment activity
</h3>
<p>Sign up or login to join the discussions!</p>
</section>
<section class="profile-settings">
<div class="register-account">
<span>Sign up to comment and more</span>
<a href="https://arstechnica.com/civis/ucp.php?mode=register" class="signup-btn button button-wide">Sign up</a>
</div>
</section>
</div>
</div>
</div>
</header>
<main id="main" class="content-wrapper">
<article itemscope="" itemtype="http://schema.org/NewsArticle" class="article-single standalone intro-standard " id="">
<div class="column-wrapper">
<div class="left-column">
<header class="article-header">
<h4 class="post-upperdek">
Tech &#x2014;
</h4>
<h1 itemprop="headline">The connected renter: How to make your apartment smarter</h1>
<h2 itemprop="description">Turning your rented space into a smart home can be tricky; we have some advice.</h2>
<section class="post-meta">
<p class="byline" itemprop="author creator" itemscope="" itemtype="http://schema.org/Person">
<a itemprop="url" href="https://arstechnica.com/author/valentina/" rel="author"><span itemprop="name">Valentina Palladino</span></a>
- <time class="date" data-time="1470827753" datetime="2016-08-10T11:15:53+00:00">Aug 10, 2016 11:15 am UTC</time>
</p>
</section> </header>
<section class="article-guts">
<div itemprop="articleBody" class="article-content post-page">
<figure class="intro-image full">
<img src="https://cdn.arstechnica.net/wp-content/uploads/2016/07/smartapartment_hero5-980x713.jpg" alt="The connected renter: How to make your apartment smarter">
<figcaption class="caption"><div class="caption-credit">Valentina Palladino</div></figcaption> </figure>
<aside id="social-left" class="social-left" aria-label="Read the comments or share this article">
<a title="73 posters participating" class="comment-count icon-comment-bubble-down" href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/?comments=1">
<h4 class="comment-count-before">reader comments</h4>
<span class="comment-count-number">132</span>
<span class="visually-hidden"> with 73 posters participating</span>
</a>
<div class="share-links">
<h4>Share this story</h4>
<ul>
<li><a href="https://www.facebook.com/sharer.php?u=https%3A%2F%2Farstechnica.com%2F%3Fpost_type%3Dpost%26p%3D921807" target="_blank" class="social-icon share-facebook" title="Share on Facebook"><span class="visually-hidden">Share on Facebook</span></a></li>
<li><a href="https://twitter.com/share?text=The+connected+renter%3A+How+to+make+your+apartment+smarter&amp;url=https%3A%2F%2Farstechnica.com%2F%3Fpost_type%3Dpost%26p%3D921807" target="_blank" class="social-icon share-twitter" title="Share on Twitter"><span class="visually-hidden">Share on Twitter</span></a></li>
<li><a href="https://www.reddit.com/submit?url=https%3A%2F%2Farstechnica.com%2F%3Fpost_type%3Dpost%26p%3D921807&amp;title=The+connected+renter%3A+How+to+make+your+apartment+smarter" target="_blank" class="social-icon share-reddit" title="Share on Reddit"><span class="visually-hidden">Share on Reddit</span></a></li>
</ul>
</div>
</aside>
<p class="p1">
</p><p class="p1">Name a home appliance or product, and there&apos;s probably a smart version of it today. But for the renters among us, it can be tricky to navigate the aisles of smart light bulbs, thermostats, air conditioners, and vacuums to pick out devices that won&apos;t jeopardize your security deposit. When you don&apos;t own your home, there&#x2019;s a different set of rules dictating modifications, and some smart home products don&apos;t take that into account.</p>
<p class="p1">Luckily these days, an increasing number of smart home devices can cater to apartment dwellers that want to avoid ripping open walls and trussing up wires. And as a NYC-based Ars staffer, I had a particularly perfect rental laboratory to recently test and explore&#xA0;what kinds of smart home devices fit renters&apos; needs.</p>
<h2>What protocols should/can you use?</h2>
<p class="p1">When picking out accessories, for convenience&apos;s sake you&apos;ll probably want to decide ahead of time which of the many competing smart home ecosystems you&apos;d like those products to come from. If you want to control all of your smart home products from the same app or want every product to be able to &quot;talk&quot; to each other, brand or protocol lock-in is the easiest current approach.</p>
<h3>HomeKit</h3>
<p class="p1">Let&apos;s start with the one everyone has heard of: HomeKit. Apple&apos;s IoT framework lets you control HomeKit-compatible devices via your iPhone or iPad. This framework tries to make it easier for users to control all smart home products by allowing the devices to talk to each other over a home&apos;s Wi-Fi network. With HomeKit, you can use Siri voice commands to control smart products&#x2014;with iOS 10 in fact, an official &quot;Home&quot; app will make it easier to control HomeKit devices and group them together. After setting devices up, you can control them using simple phrases like, &quot;Siri, turn off the living room lights.&quot; You can also make nuanced commands that control only certain devices. For example, the voice command &quot;Good morning&quot; could initiate a group of actions including turning on the bedroom and bathroom lights and opening the blinds.</p>
<p class="p1">The thing to remember about HomeKit is that manufacturers must integrate compatible hardware into their products under the terms of <a href="https://developer.apple.com/homekit/"> Apple&apos;s MFi licensing program </a>. For example, a year or so ago Philips <a href="http://arstechnica.com/gadgets/2015/10/new-philips-hue-bridge-works-with-homekit-lets-siri-control-lights/"> came out with a HomeKit-enabled Hue bridge </a> for its smart lighting systems. The Philips Hue bridge already existed, but the company needed to make a new one that worked specifically with HomeKit. So far, companies including Belkin, Honeywell, and iHome among others have made HomeKit-ready devices. But still, the overall number of HomeKit-compatible devices is dwarfed by the number of smart home products available. HomeKit is also only available on Apple devices running iOS 8.2 or later, and watchOS 2 allows you to control devices via your Apple Watch.</p>
<p class="p1"><figure class="video"><div class="wrapper" style="padding-bottom: 56.122448979592%;"><div class="ars-video-container" data-video-id="5722396833055407d4000003" data-video-params="[]" style="height:550px; width:980px"></div></div><figcaption class="caption"><div class="caption-text">Video shot/edited by Jennifer Hahn.</div></figcaption></figure></p>
<h3>Alexa</h3>
<p class="p1">Amazon&apos;s Alexa voice assistant is tied to Amazon&apos;s products, namely the Echo, and you can ask it questions about basically anything. Along the same lines as Siri, you can also control some smart home products by asking Alexa to turn things on and off. Through the Alexa mobile app, users can create batch commands similar to those available through Siri, and this option makes it possible to control entire rooms filled with devices. Unlike with Siri and HomeKit, though, manufacturers can create Alexa compatibility with software updates rather than implementing entirely new hardware&#x2014;that&apos;s why companies like Nest and Haiku have been able to add Alexa support to their smart home products.<div class="pullbox sidebar story-sidebar right"><div class="story-sidebar-part"><a href="https://arstechnica.com/gadgets/2016/05/one-year-after-alexa-amazons-echo-has-found-a-small-but-smart-niche/" class="recommendation-further-reading story-sidebar-part-img" style="background-image:url(&apos;https://cdn.arstechnica.net/wp-content/uploads/2016/04/echalexa1-300x150.jpg&apos;);" tabindex="-1" role="presentation" aria-hidden="true"></a><div class="story-sidebar-part-content"><h3>Further Reading</h3><a class="recommendation-further-reading" href="https://arstechnica.com/gadgets/2016/05/one-year-after-alexa-amazons-echo-has-found-a-small-but-smart-niche/">One year after Alexa: Amazon&#x2019;s Echo has found a small but smart niche</a></div></div></div></p>
<p class="p1">The convenience of HomeKit and Alexa is that you can control all of your devices using your voice&#x2014;Alexa and Siri handle the communication from there. Otherwise, most smart home devices instead rely on dedicated apps that act like remote controls. HomeKit has the added benefit that devices can &quot;talk&quot; to one another, much like compatible devices under the Works with Nest framework. If, for example, you leave your home and lock your smart door lock behind you, that action could trigger turning off the lights inside a home.</p>
<h3>Zigbee and Z-Wave</h3>
<p class="p1">While HomeKit and Alexa have a lot of growing to do in terms of compatibility, Zigbee and Z-Wave have already linked to hundreds of working smart home devices. Both are mesh networking systems that send information between all devices on a network bound together by a hub. Zigbee runs on the universal 2.4GHz ISM frequency band, meaning its products are country-agnostic. By contrast, Z-Wave runs on the 915 MHz ISM band in the United States and the 868 MHz RFID band in the UK.</p>
<p class="p1">Both of these protocols, especially Zigbee, are suited for those who want to fiddle around with their devices and set unique commands and controls. However, keep in mind that hubs and devices on Zigbee and Z-Wave tend to have shorter ranges, meaning you&apos;ll need to keep individual devices relatively close to each other. For apartment dwellers, that might actually work to your advantage if you have only a few small rooms to connect.</p>
<h3>Which products are off-limits?</h3>
<p class="p1"><div class="pullbox sidebar story-sidebar right"><div class="story-sidebar-part"><a href="https://arstechnica.com/gadgets/2012/08/a-thermostat-that-learns-three-months-with-the-nest/" class="recommendation-further-reading story-sidebar-part-img" style="background-image:url(&apos;https://cdn.arstechnica.net/wp-content/uploads/2012/07/Nest5-300x100.jpg&apos;);" tabindex="-1" role="presentation" aria-hidden="true"></a><div class="story-sidebar-part-content"><h3>Further Reading</h3><a class="recommendation-further-reading" href="https://arstechnica.com/gadgets/2012/08/a-thermostat-that-learns-three-months-with-the-nest/">A thermostat that learns? Three months with the Nest</a></div></div></div>The best way to know what can and can&#x2019;t work within your rental is decidedly analogue&#x2014;read your lease and any other renter&apos;s agreements you may have signed upon moving in. Check the official documentation to see what you have access to and what you don&apos;t. For example, my apartment came with a refrigerator, oven, and microwave, and my lease forbids me from installing any different ones without notifying the landlord&apos;s office first. I&apos;m also not allowed to have washer or dryer units in my apartment. You don&apos;t want to buy any smart home product only to realize its installation or very existence violates your lease agreements.</p>
<p class="p1">As a general rule of thumb, smart thermostats are typically off-limits for renters. These products require rewiring stuff that&apos;s hidden behind walls, after all. Another category you&apos;ll want to be careful with is smart locks. Many (not all&#x2014;we&apos;ll get to some examples) replace the locks on your doors, which renters typically cannot do without a specific reason or without informing management.</p>
<div id="action_button_container"></div>
</div>
<nav class="page-numbers">Page: <span class="numbers">1 <a href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2/">2</a> <a href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/3/">3</a> <a href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2/"><span class="next">Next <span class="arrow">&#x2192;</span></span></a></span></nav>
</section>
</div>
<div class="xrail">
<div class="xrail-content">
<aside class="ad ad_xrail ad_xrail_top" aria-label="Top sidebar advertisement"></aside>
<aside class="ad_native ad_native_xrail" aria-label="Sidebar native advertisement"></aside>
</div>
</div>
</div>
<div class="column-wrapper">
<div class="left-column">
<div id="social-footer">
<a title="73 posters participating" class="comment-count icon-comment-bubble-down" href="https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/?comments=1">
<h4 class="comment-count-before">reader comments</h4>
<span class="comment-count-number">132</span>
<span class="visually-hidden"> with 73 posters participating</span>
</a>
<div class="share-links">
<h4>Share this story</h4>
<ul>
<li><a href="https://www.facebook.com/sharer.php?u=https%3A%2F%2Farstechnica.com%2F%3Fpost_type%3Dpost%26p%3D921807" target="_blank" class="social-icon share-facebook" title="Share on Facebook"><span class="visually-hidden">Share on Facebook</span></a></li>
<li><a href="https://twitter.com/share?text=The+connected+renter%3A+How+to+make+your+apartment+smarter&amp;url=https%3A%2F%2Farstechnica.com%2F%3Fpost_type%3Dpost%26p%3D921807" target="_blank" class="social-icon share-twitter" title="Share on Twitter"><span class="visually-hidden">Share on Twitter</span></a></li>
<li><a href="https://www.reddit.com/submit?url=https%3A%2F%2Farstechnica.com%2F%3Fpost_type%3Dpost%26p%3D921807&amp;title=The+connected+renter%3A+How+to+make+your+apartment+smarter" target="_blank" class="social-icon share-reddit" title="Share on Reddit"><span class="visually-hidden">Share on Reddit</span></a></li>
</ul>
</div>
</div>
<section class="article-author">
<a style="background-image:url(&apos;https://cdn.arstechnica.net/wp-content/uploads/2016/05/v.palladino-45953.jpg&apos;);" class="author-photo" href="https://arstechnica.com/author/valentina" tabindex="-1" role="presentation" aria-hidden="true"></a>
<div class="author-bio">
<section class="author-bio-top">
<a href="https://arstechnica.com/author/valentina" class="author-name">Valentina Palladino</a>
Valentina reviews consumer electronics for Ars Technica, testing all kinds of gadgets with a focus on mobile devices and wearables. She has a soft spot for Chromebooks. </section>
<section class="author-social">
<strong>Twitter</strong> <a href="https://www.twitter.com/valentinalucia" target="_blank">@valentinalucia</a>
</section>
</div>
</section>
</div>
<div class="xrail"></div>
</div>
<div id="article-footer-wrap">
<aside class="ad ad_fullwidth fullwidth" aria-label="Full width advertisement"></aside>
<section id="comments-area" class="comments-area column-wrapper">
<div class="row comments-row left-column">
<a name="comments-bar"></a>
<div id="comments-container"></div>
<div id="comments-posting-container" class="thick-divide-bottom">
<p id="reply">You must <a href="https://arstechnica.com/civis/ucp.php?mode=login&amp;return_to/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/" class="vote_login">login or create an account</a> to comment.</p>
</div>
</div>
<div class="xrail xrail-comments">
<div class="xrail-content xrail-content-comments">
<aside class="ad ad_xrail ad_xrail_comments" aria-label="Comments sidebar advertisement"></aside>
</div>
</div>
</section>
<section class="inline-playlist">
<div class="ars-video-playlist">
<h3 class="ars-video-playlist-module-header">Channel <span>Ars Technica</span></h3>
<div class="ars-video-playlist-module" data-playlist-id="arstechnica-channel-ars-gadgets" data-video-options="[]"></div>
</div>
</section>
<div class="prev-next-links">
<a href="https://arstechnica.com/science/2016/08/access-to-healthcare-through-aca-may-actually-be-improving-americans-health/" rel="prev"><span class="arrow">&#x2190;</span> Previous story</a> <a href="https://arstechnica.com/tech-policy/2016/08/judge-blasts-dojs-refusal-to-explain-stingray-use-in-attempted-murder-case/" rel="next">Next story <span class="arrow">&#x2192;</span></a></div>
<footer id="article-footer">
<div id="recommendations-footer">
<div id="story-recommendations">
<div class="heading-column">
<h3>Related Stories</h3>
</div>
<ul id="story-recs" class="rec-wrap"></ul>
</div>
<div id="sponsored-recommendations">
<div class="heading-column">
<h3>Sponsored Stories</h3>
<a href="http://www.outbrain.com/what-is/default/en" target="_blank">Powered by <span class="icon outbrain-logo icon-logo-outbrain"></span></a>
</div>
<ul id="outbrain-recs"></ul>
</div>
<div id="latest-stories">
<div class="heading-column">
<h3>Today on Ars</h3>
</div>
<ul id="latest-recs" class="rec-wrap"></ul>
</div>
</div>
</footer>
</div>
</article>
</main>
<footer class="site-footer">
<nav class="nav-footer">
<section>
<ul>
<li><a href="https://arstechnica.com/store/">Store</a></li>
<li><a href="https://arstechnica.com/store/product/subscriptions/">Subscribe</a></li>
<li><a href="https://arstechnica.com/about-us/">About Us</a></li>
<li><a href="https://arstechnica.com/rss-feeds/">RSS Feeds</a></li>
<li><a rel="nofollow" href="http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/?view=mobile">View Mobile Site</a></li>
</ul>
</section>
<section>
<ul>
<li><a href="https://arstechnica.com/contact-us/">Contact Us</a></li>
<li><a href="https://arstechnica.com/staff-directory/">Staff</a></li>
<li><a href="https://arstechnica.com/advertise-with-us/">Advertise with us</a></li>
<li><a href="https://arstechnica.com/reprints/">Reprints</a></li>
</ul>
</section>
<section class="footer-newsletter">
<div class="newsletter-wrapper">
<h3>
<a href="https://arstechnica.com/newsletters/">Newsletter Signup</a>
</h3>
<p>Join the Ars Orbital Transmission mailing list to get weekly updates delivered to your inbox.</p>
<a class="button" href="https://arstechnica.com/newsletters/">Sign me up &#x2192;</a>
</div>
</section>
</nav>
<section class="footer-terms-logo">
<div class="cn-logo">
<a href="http://condenast.com/" class="icon icon-logo-cn-us" title="Visit Cond&#xE9; Nast"></a>
</div>
<p id="copyright-terms">
CNMN Collection<br>
WIRED Media Group<br>
&#xA9; 2020 Cond&#xE9; Nast. All rights reserved. Use of and/or registration on any portion of this site constitutes acceptance of our <a href="https://www.condenast.com/user-agreement/">User Agreement</a> (updated 1/1/20) and <a href="https://www.condenast.com/privacy-policy/">Privacy Policy and Cookie Statement</a> (updated 1/1/20) and <a href="https://arstechnica.com/amendment-to-conde-nast-user-agreement-privacy-policy/">Ars Technica Addendum</a> (effective 8/21/2018). Ars may earn compensation on sales from links on this site. <a href="https://arstechnica.com/affiliate-link-policy/">Read our affiliate link policy</a>.<br>
<a href="https://www.condenast.com/privacy-policy/#california">Your California Privacy Rights</a> | <a id="ot-sdk-btn" class="ot-sdk-show-settings">Do Not Sell My Personal Information</a><br>
The material on this site may not be reproduced, distributed, transmitted, cached or otherwise used, except with the prior written permission of Cond&#xE9; Nast.<br>
<a href="https://www.condenast.com/online-behavioral-advertising-oba-and-how-to-opt-out-of-oba/#clickheretoreadmoreaboutonlinebehavioraladvertising(oba)">Ad Choices</a>
</p>
</section>
</footer>
</div>
</body>
</html>

File diff suppressed because one or more lines are too long

@ -30,7 +30,6 @@ export default async function collectAllPages({
html,
$,
metaCache,
contentOnly: true,
extractedTitle: title,
previousUrls,
};

@ -0,0 +1,63 @@
// Transform applied to every h2 inside the selected content.
// Some pages carry a significant h2 that the parser removes when it does not
// follow a paragraph. Putting an empty paragraph in front of it keeps the
// heading; the empty paragraph itself is removed anyway.
function keepHeading($heading) {
  $heading.before('<p></p>');
}

// Custom extractor for arstechnica.com.
//
// Pagination note: articles on this site are often split across pages, but no
// CSS selector was found that identifies the next page (on the last page there
// is a link whose selector marks the previous page as "next"). The parser
// appears to locate the next page on its own as long as the `fallback` option
// keeps its default value of true.
export const ArstechnicaComExtractor = {
  domain: 'arstechnica.com',

  title: { selectors: ['title'] },

  author: { selectors: ['*[rel="author"] *[itemprop="name"]'] },

  // Read from the `datetime` attribute of the byline's <time> element.
  date_published: { selectors: [['.byline time', 'datetime']] },

  dek: { selectors: ['h2[itemprop="description"]'] },

  // Read from the `value` attribute of the og:image meta tag.
  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },

  content: {
    selectors: ['div[itemprop="articleBody"]'],

    // Adjustments made to the selected content before it is cleaned.
    transforms: {
      h2: keepHeading,
    },

    // Anything matching these selectors is removed from the result.
    clean: [
      // Enlarge links and separators inside image captions.
      'figcaption .enlarge-link',
      'figcaption .sep',

      // Videos could not be transformed into usable elements, so they are
      // dropped.
      'figure.video',

      // Image galleries that do not work.
      '.gallery',
      'aside',
      '.sidebar',
    ],
  },
};

@ -0,0 +1,159 @@
import assert from 'assert';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
import { excerptContent } from 'utils/text';
const fs = require('fs');
describe('ArstechnicaComExtractor', () => {
  describe('initial test case', () => {
    const articleUrl =
      'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
    // Promise returned by Mercury.parse; each test awaits it independently.
    let parsed;

    beforeAll(() => {
      const html = fs.readFileSync(
        './fixtures/arstechnica.com/1587927767738.html'
      );
      // fallback: false restricts the run to this custom extractor's selectors.
      parsed = Mercury.parse(articleUrl, { html, fallback: false });
    });

    it('is selected properly', () => {
      // Sanity check: the extractor registered for this URL's domain is the
      // one that gets picked.
      const { domain } = getExtractor(articleUrl);
      const { hostname } = URL.parse(articleUrl);
      assert.equal(domain, hostname);
    });

    it('returns the title', async () => {
      // Driven by the title selector in
      // ./src/extractors/custom/arstechnica.com/index.js.
      const article = await parsed;
      assert.equal(
        article.title,
        'The connected renter: How to make your apartment smarter'
      );
    });

    it('returns the author', async () => {
      // Driven by the author selector of the custom extractor.
      const article = await parsed;
      assert.equal(article.author, 'Valentina Palladino');
    });

    it('returns the date_published', async () => {
      // Driven by the date_published selector of the custom extractor.
      const article = await parsed;
      assert.equal(article.date_published, '2016-08-10T11:15:53.000Z');
    });

    it('returns the dek', async () => {
      // Driven by the dek selector of the custom extractor.
      const article = await parsed;
      assert.equal(
        article.dek,
        'Turning your rented space into a smart home can be tricky; we have some advice.'
      );
    });

    it('returns the lead_image_url', async () => {
      // Driven by the lead_image_url selector of the custom extractor.
      const article = await parsed;
      assert.equal(
        article.lead_image_url,
        'https://cdn.arstechnica.net/wp-content/uploads/2016/07/smartapartment_hero5-640x215.jpg'
      );
    });

    // The pages_rendered check below was left disabled in the original file.
    // it('returns the pages_rendered', async () => {
    //   const { pages_rendered } = await result
    //   assert.equal(pages_rendered, `3`)
    // });

    it('returns the content', async () => {
      // Driven by the content selector (plus clean/transforms) of the custom
      // extractor. Only the first 13 words of the extracted text are compared.
      const article = await parsed;
      const $ = cheerio.load(article.content || '');
      const leadingText = $('*')
        .first()
        .text();
      const first13 = excerptContent(leadingText, 13);
      assert.equal(
        first13,
        "Name a home appliance or product, and there's probably a smart version of"
      );
    });
  });

  describe('Keep the first h2 on subsequent pages test', () => {
    const pageTwoUrl =
      'https://arstechnica.com/science/2020/04/should-you-wear-a-face-mask-heres-all-the-data-we-have/2/';
    let parsed;

    beforeAll(() => {
      // NOTE(review): this is the same fixture path the first suite reads,
      // although the URL here names a different article - confirm this is the
      // intended fixture file.
      const html = fs.readFileSync(
        './fixtures/arstechnica.com/1587927767738.html'
      );
      parsed = Mercury.parse(pageTwoUrl, { html, fallback: false });
    });

    it('is selected properly', () => {
      // Sanity check: the extractor registered for this URL's domain is the
      // one that gets picked.
      const { domain } = getExtractor(pageTwoUrl);
      const { hostname } = URL.parse(pageTwoUrl);
      assert.equal(domain, hostname);
    });

    it('returns the content', async () => {
      // Exactly one h2 is expected in the extracted content.
      const article = await parsed;
      const $ = cheerio.load(article.content || '');
      const headings = $('h2');
      assert.equal(headings.length, 1);
    });
  });
});

@ -139,3 +139,4 @@ export * from './pastebin.com';
export * from './www.abendblatt.de';
export * from './www.gruene.de';
export * from './www.engadget.com';
export * from './arstechnica.com';

Loading…
Cancel
Save