From 9a961aa5955b2b683aa562e927842c1c7a90ee4e Mon Sep 17 00:00:00 2001 From: John Brayton Date: Wed, 10 Aug 2022 18:16:14 -0400 Subject: [PATCH] feat: Add a custom extractor for www.ndtv.com. (#554) * feat:Add a custom extractor for ma.ttias.be. When parsing content for cron.weekly issues, such as the one at https://ma.ttias.be/cronweekly/issue-130/, Mercury Parser would remove headings and ordered lists that were part of the content. This resolves that as follows: * Remove "id" attributes from "h1" and "h2" elements. Those attributes would result in the elements having a low weight. * Since Mercury Parser demotes "h1" elements to "h2", demote "h2" elements to "h3". * Add class="entry-content-asset" to "ul" elements to avoid them being removed. * removed redundant comment. * feat: Add a custom extractor for engadget.com. * feat: Add a custom extractor for www.ndtv.com. * Works, but I need to figure how to make pagination work correctly. * fixed pagination - would only retrieve first or second page because we would send contentOnly: true on subsequent pages (page 2). removed failover: true from preview. * rolled back { fallback: false } option removal * Clarified comments. * rolling back yarn.lock changes Co-authored-by: John Holdun --- fixtures/www.ndtv.com/1587821636077.html | 17 +++ src/extractors/custom/index.js | 1 + src/extractors/custom/www.ndtv.com/index.js | 54 ++++++++ .../custom/www.ndtv.com/index.test.js | 120 ++++++++++++++++++ 4 files changed, 192 insertions(+) create mode 100644 fixtures/www.ndtv.com/1587821636077.html create mode 100644 src/extractors/custom/www.ndtv.com/index.js create mode 100644 src/extractors/custom/www.ndtv.com/index.test.js diff --git a/fixtures/www.ndtv.com/1587821636077.html b/fixtures/www.ndtv.com/1587821636077.html new file mode 100644 index 00000000..64eb4f76 --- /dev/null +++ b/fixtures/www.ndtv.com/1587821636077.html @@ -0,0 +1,17 @@ +Coronavirus News: Donald Trump Talks Retaliation If India Rejects Export Of Key Drug +

COVID-19: Trump Talks "Retaliation" If India Rejects Export Of Key Drug

Amid rising pressure, the government is likely to take a decision on the matter today and clear the move after calculating sufficient stocks for the country, sources have told NDTV.

COVID-19: Trump Talks 'Retaliation' If India Rejects Export Of Key Drug

Coronavirus: US President Donald Trump said he spoke to PM Modi on Sunday.

Highlights

  • Hydroxychloroquine is a key drug used in the treatment of COVID-19
  • The centre announced a hold on export of hydroxychloroquine on March 25
  • India is one of the world's largest manufacturers of the medicine
Washington/ New Delhi:

US President Donald Trump has said "there may be retaliation" if India does not agree to export hydroxychloroquine, a key anti-malaria drug believed by many to be effective in the treatment of coronavirus. His remarks come nearly two weeks after the government banned export of the drug as experts test its efficacy in helping treat COVID-19 patients.

"I would be surprised if he (Prime Minister Narendra Modi) would, you know, because India does very well with the United States," Mr Trump said at a White House press briefing on Monday, when a reporter asked whether he was worried about "retaliation to the US ban on export of medical goods" from India.

"I don't like that decision, I didn't hear that that was his decision. I know that he stopped it for other countries. I spoke to him yesterday, we had a very good talk and we'll see whether or not that's his... For many years, they've been taken advantage of the United States on trade. So I would be surprised if that were his decision. He'd have to tell me that. I spoke to him Sunday morning, called him, and I said, we'd appreciate you allowing our supply to come out. If he doesn't allow it to come out. That would be OK. But of course, there may be retaliation. Why wouldn't there be," the US President said.

Trump has described hydroxychloroquine as a "game-changer" though it is yet to be established as an effective cure for COVID-19.

Last month, the Indian Council of Medical Research or ICMR had recommended the use of hydroxychloroquine for treating healthcare workers serving coronavirus patients.

In a notification on March 25, the government announced a hold on the medicine's export, adding that it would be allowed only on "case-to case basis" on humanitarian grounds. India has reported over 4,000 coronavirus patients so far and more than 100 deaths. 

This morning, however, the government said that key drugs linked to COVID-19 cure will be supplied to "nations that have been badly affected" by the pandemic.   

"In view of the humanitarian aspects of the pandemic, it has been decided that India would licence paracetamol and Hydroxychloroquine in appropriate quantities to all our neighbouring countries who are dependent on our capabilities. We will also be supplying these essential drugs to some nations who have been particularly badly affected by the pandemic. We would therefore discourage any speculation in this regard or any attempts to politicise the matter," Foreign Ministry spokesperson Anurag Srivastava said.

Currently, the US has the most number of coronavirus cases in the world. The coronavirus death count has crossed 10,000-mark in US; over 366,000 have contracted infection so far.

Last week, the US Food and Drug Administration (FDA) issued an Emergency Use Authorization or EUA to permit the emergency use of hydroxychloroquine sulfate supplied from the national stockpile to treat adult and adolescent COVID-19 patients who weigh 50 kg or more and are hospitalised, according to an official statement.

World

28,25,140Cases
18,29,648Active
7,97,826Recovered
1,97,666Deaths
Coronavirus has spread to 185 countries. The total confirmed cases worldwide are 28,25,140 and 1,97,666 have died; 18,29,648 are active cases and 7,97,826 have recovered as on April 25, 2020 at 5:36 pm.

India

24,942 1490Cases
18,953 1038Active
5,210 396Recovered
779 56Deaths
In India, there are 24,942 confirmed cases including 779 deaths. The number of active cases is 18,953 and 5,210 have recovered as on April 25, 2020 at 5:00 pm.

State & District Details

+ State + + Cases + + Active + + Recovered + + Deaths +
DistrictCases
Mumbai3029
Pune660
Thane465
Nashik96
Nagpur76
Mumbai Sub Ur67
Yavatmal32
Aurangabad30
Sangli27
Ahmednagar27
Sholapur21
Buldhana21
Palaghar17
Akola17
Raigad17
Satara14
Kolhapur8
Latur8
Ratnagiri7
Amravati6
Jalgaon3
Osmanabad3
Sindhudurg (kudal)2
Chandrapur2
Dhule2
Beed1
Washim1
Parbhani1
Jalna1
Nandurbar1
Gondia1
Hingoli1
Details Awaited*2153

6817 387

5559 252

957 117

301 18

DistrictCases
Ahmedabad1298
Surat338
Vadodara188
Rajkot40
Bhavnagar32
Anand28
Bharuch23
Gandhi Nagar17
Patan15
Narmada12
Panchmahal11
Banaskantha10
Arvalli8
Chhota Udepur7
Kutch6
Mehsana6
Botad5
Porbandar3
Mahisagar3
Kheda3
Gir Somnath3
Dahod3
Sabarkantha2
Valsad2
Morbi1
Jamnagar1
Tapi1
Details Awaited*749

2815 191

2423 169

265 7

127 15

DistrictCases
Tj From Quaritine1080
Others Non Tracable213
Central Delhi184
South East130
West Delhi122
South Delhi70
North Delhi60
Shahadara48
South West Delhi42
East Delhi38
New Delhi37
North West Delhi32
North East Delhi25
Details Awaited*433

2514 138

1604 86

857 49

53 3

DistrictCases
Jaipur537
Jodhpur228
Bharatpur102
Kota99
Tonk95
Banswara60
Iran Evacuees60
Nagaur58
Jhunjhunu37
Bikaner35
Jaisalmer32
Bhilwara28
Ajmer23
Jhalawar20
Churu14
Dausa13
Alwar7
Dungarpur5
Sawai Madhopur5
Udaipur4
Hanumangarh3
Karauli3
Pali2
Pratapgarh2
Sikar2
Dholpur1
Barmer1
Details Awaited*558

2034 70

1777 70

230

27

DistrictCases
Indore915
Bhopal277
Khargon41
Dhaar41
Khandwa (east Nimar)32
Ujjain29
Raisen29
Hoshangabad25
Jabalpur25
Barwani24
Dewas18
Vidisha13
Ratlam13
Morena13
Mandsaur9
Agar8
Shajapur6
Sheopur-kalan4
Alirajpur3
Gwalior3
Sagar2
Shivpuri2
Chindwara2
Betul2
Rajgarh1
Dindori1
Tikamgarh1
Details Awaited*413

1952 100

1650 84

210 7

92 9

DistrictCases
Agra241
Lucknow167
Gautam Budha Nagar98
Meerut75
Saharanpur72
Kanpur59
Firozabad58
Moradabad58
Ghaziabad41
Shamli26
Bijnor26
Basti19
Bulandshahar18
Sitapur17
Hapur17
Amroha17
Baghpat15
Rampur15
Varanasi14
Budaun13
Azamgarh7
Auraiya7
Sambhal7
Mathura6
Maharajganj6
Ghazipur6
Kannauj6
Pratapgarh6
Bareilly6
Muzaffar Nagar5
Jaunpur5
Lakhimpur Kheri4
Mainpuri4
Hathras4
Kanshi Ram Nagar (kasganj)3
Mirzapur3
Etah3
Etawah3
Kaushambi2
Banda2
Pilibhit2
Hardoi2
Rae-bareilly2
Unnao1
Sultanpur1
Bara-banki1
Sant Kabir Nagar1
Shahjahanpur1
Sant Ravi Das Nagar (bhadoi)1
Mau1
Gonda1
Allahabad1
Details Awaited*602

1778 174

1504 130

248 42

26 2

DistrictCases
Chennai303
Coimbatore133
Tiruppur109
Dindigul76
Erode70
Tirunelveli62
Chengalpattu53
Tiruchirapalli50
Namakkal50
Thiruvallur48
Thanjavur46
Madurai46
Nagapattinam44
Theni43
Karur42
Ranipet39
Villupuram36
Thoothukudi27
Thiruvarur27
Cuddalore26
Tenkasi26
Salem24
Vellore22
Virudhunagar19
Tirupattur17
Nagerkoil (kanyakumari)16
Sivagangai12
Tiruvannamalai12
Ramanathapuram11
Udagamandalam9
Kancheepuram9
Perambalur5
Ariyalur4
Kalllakurichi3
Pudukottai1
Details Awaited*235

1755 72

867

866 114

22 2

DistrictCases
Kurnool158
Guntur128
Krishna76
Nellore67
Prakasam44
Kadapa37
West Godavari35
Anantapur29
Chittoor28
East Godavari24
Visakhapatnam20
Details Awaited*415

1061 106

859 78

171 26

31 2

DistrictCases
Hyderabad472
Suryapet75
Nizamabad56
Vikarabad36
Hyderabad Rural (ranga Reddy)35
Gadwal (jogulamba)32
Medchal30
Warangal25
Adilabad19
Karimnagar19
Nirmal18
Yadadri15
Nalgonda15
Mahaboobnagar12
Kamareddy11
Sangareddy8
Khammam7
Medak6
Asifabad (komarambhim)5
Kothagudem (badadri)4
Jagityal3
Siricilla (rajanna)3
Bhupalpally (jayashanker)3
Nagarkurnool2
Peddapally2
Mulugu2
Janagoan2
Siddipet1
Mahabubabad1
Details Awaited*65

984

705

253

26

DistrictCases
Kolkata184
Howrah79
North 24 Parganas46
Purba Mednipur21
Hooghly12
South 24 Parganas9
Kalimpomg7
Paschim Burdwan7
Nadia6
Jalpaiguri5
Darjeeling4
Paschim Mednipur4
Diamond Harbour-hd3
Murshidabad2
Basirhat-hd1
Purba Burdwan1
Nandigram-hd1
Details Awaited*179

571 57

450 54

103

18 3

DistrictCases
Mysuru84
Bbmp66
Belagavi42
Vijayapura32
Kalburgi27
Bengaluru (u)23
Bagalkote21
Chikkaballapur16
Bidar15
Dakshin Kannada14
Ballari13
Mandya12
Bengaluru (r)12
Uttara Kannada11
Dharwad7
Gadag4
Udupi3
Tumakuru2
Davanagere2
Kodagu1
Chitradurga1
Details Awaited*81

489 26

318 23

153 3

18

DistrictCases
Bandipora81
Srinagar79
Baramulla43
Kupwara31
Jammu27
Shopian22
Udhampur20
Ganderbal14
Badgam13
Kulgam6
Rajouri4
Samba4
Anantnag3
Pulwama3
Details Awaited*104

454 27

340 10

109 17

5

DistrictCases
Kasaragode170
Cannanore(kannur)92
Ernakulam24
Kozhicode (calicut)20
Malappuram20
Pathanamthitta17
Thiruvananthapuram14
Thrissur13
Idukki10
Kollam9
Palakkad8
Alappuzha5
Kottayam3
Wyanad3
Details Awaited*43

451 3

116

331 7

4 1

DistrictCases
Sasnagar61
Jalandhar48
Patiala26
Pathankot24
Nawanshahr (sbs Nagar)19
Ludhiana16
Amritsar11
Mansa11
Hoshiarpur7
Moga4
Rupnagar3
Sangrur3
Faridkot3
Kapurthala2
Barnala2
Fatehgarh Sahib (sarhind)2
Muktsar1
Gurdaspur1
Firozepur1
Details Awaited*53

298 21

214 18

67 2

17 1

DistrictCases
Nuh57
Faridabad42
Gurgaon38
Palwal34
Panchkula18
Ambala12
Sonepat7
Karnal6
Panipat5
Sirsa4
Yamunanagar3
Bhiwani3
Kurukshetra2
Kaithal2
Jind2
Hissar2
Rohtak1
Fatehabad1
Charkhi Dadri1
Details Awaited*32

272

113

156

3

DistrictCases
Siwan29
Nalanda28
Munger20
Begusarai9
Patna7
Gaya5
Buxar4
Gopalganj3
Nawada3
Bhagalpur1
Bhojpur1
Lakhisarai1
Saran1
Vaishali1
Details Awaited*115

228 52

180 52

46

2

DistrictCases
Khurda (bhuvaneshwar)46
Bhadrak8
Balasore3
Jajpur2
Kalahandi2
Kendrapara2
Sundargarh2
Cuttack1
Dhenkanal1
Puri1
Details Awaited*26

94 4

60 4

33

1

DistrictCases
Ranchi25
Bokaro10
Hazaribagh3
Dhanbad2
Simdega2
Deoghar1
Giridh1
Koderma1
Details Awaited*14

59 4

47 3

9 1

3

DistrictCases
Dehradun24
Nainital9
Haridwar7
Udhamsingh Nagar4
Almora1
Pauri Garhwal1
Details Awaited*2

48 1

23

25 1

0

DistrictCases
Una16
Solan9
Chamba6
Kangra5
Hamirpur2
Sirmaur1
Details Awaited*1

40

21

18

1

DistrictCases
Korba28
Raipur5
Bilaspur1
Durg1
Rajnandgaon1

36

6

30 2

0

DistrictCases
Golaghat9
Marigaon6
Dhubri4
Goalpara4
Nalbari4
Kamrup Metro2
Cachar1
Hailakandi1
Kamrup1
Karimganj1
Lakhimpur1
S Mancachar1
Details Awaited*1

36

16

19

1

DistrictCases
Chandigarh26
Details Awaited*2

28 1

13

15 1

0

DistrictCases
South Andaman16
Details Awaited*11

27 5

16 5

11

0

DistrictCases
Leh (ladakh)14
Details Awaited*6

20 2

6 2

14

0

DistrictCases
East Khasi Hills11
Details Awaited*1

12

11

0

1

DistrictCases
Puducherry6
Details Awaited*1

7

4

3

0

DistrictCases
North Goa6
South Goa1

7

0

7

0

DistrictCases
Imphal West1
Thoubal1

2

0

2

0

DistrictCases
Gomati1
North Tripura1

2

1

1

0

DistrictCases
Aizwal West1

1

1

0

0

DistrictCases
Lohit1

1

0

1

0

Listen to the latest songs, only on JioSaavn.com
diff --git a/src/extractors/custom/index.js b/src/extractors/custom/index.js
index d3ea4e5e..f070b7de 100644
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@@ -140,3 +140,4 @@ export * from './www.abendblatt.de';
 export * from './www.gruene.de';
 export * from './www.engadget.com';
 export * from './arstechnica.com';
+export * from './www.ndtv.com';
diff --git a/src/extractors/custom/www.ndtv.com/index.js b/src/extractors/custom/www.ndtv.com/index.js
new file mode 100644
index 00000000..daa85b22
--- /dev/null
+++ b/src/extractors/custom/www.ndtv.com/index.js
@@ -0,0 +1,54 @@
+export const WwwNdtvComExtractor = {
+  domain: 'www.ndtv.com',
+
+  title: {
+    selectors: [['meta[name="og:title"]', 'value'], 'h1.entry-title'],
+  },
+
+  author: {
+    selectors: ['span[itemprop="author"] span[itemprop="name"]'],
+  },
+
+  date_published: {
+    selectors: [['span[itemprop="dateModified"]', 'content']],
+  },
+
+  dek: {
+    selectors: ['h2'],
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']],
+  },
+
+  content: {
+    selectors: ['div[itemprop="articleBody"]'],
+
+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy loaded images
+    transforms: {
+      // This site puts a dateline in a 'b' above the first paragraph and blends it
+      // in with CSS. Move the dateline into the following 'p', but only when such a
+      // sibling exists: an empty cheerio selection is still truthy, so check length.
+      '.place_cont': $node => {
+        if (!$node.parents('p').length) {
+          const nextSibling = $node.next('p');
+          if (nextSibling.length) {
+            $node.remove();
+            nextSibling.prepend($node);
+          }
+        }
+      },
+    },
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: [
+      '.highlghts_Wdgt',
+      '.ins_instory_dv_caption',
+      'input',
+      '._world-wrapper .mt20',
+    ],
+  },
+};
diff --git a/src/extractors/custom/www.ndtv.com/index.test.js b/src/extractors/custom/www.ndtv.com/index.test.js
new file mode 100644
index 00000000..f39f3703
--- /dev/null
+++ b/src/extractors/custom/www.ndtv.com/index.test.js
@@ -0,0 +1,120 @@
+import assert from 'assert';
+import URL from 'url';
+import cheerio from 'cheerio';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+const fs = require('fs');
+
+describe('WwwNdtvComExtractor', () => {
+  describe('initial test case', () => {
+    let result;
+    let url;
+    beforeAll(() => {
+      url =
+        'https://www.ndtv.com/india-news/coronavirus-us-president-donald-trump-says-there-may-be-retaliation-if-india-doesnt-clear-export-of-2207327';
+      const html = fs.readFileSync(
+        './fixtures/www.ndtv.com/1587821636077.html'
+      );
+      result = Mercury.parse(url, { html, fallback: false });
+    });
+
+    it('is selected properly', () => {
+      // This test should be passing by default.
+      // It sanity checks that the correct parser
+      // is being selected for URLs from this domain
+      const extractor = getExtractor(url);
+      assert.equal(extractor.domain, URL.parse(url).hostname);
+    });
+
+    it('returns the title', async () => {
+      // To pass this test, fill out the title selector
+      // in ./src/extractors/custom/www.ndtv.com/index.js.
+      const { title } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(
+        title,
+        `COVID-19: Trump Talks "Retaliation" If India Rejects Export Of Key Drug`
+      );
+    });
+
+    it('returns the author', async () => {
+      // To pass this test, fill out the author selector
+      // in ./src/extractors/custom/www.ndtv.com/index.js.
+      const { author } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(author, 'Swati Bhasin');
+    });
+
+    it('returns the date_published', async () => {
+      // To pass this test, fill out the date_published selector
+      // in ./src/extractors/custom/www.ndtv.com/index.js.
+      const { date_published } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(date_published, '2020-04-07T10:19:34.000Z');
+    });
+
+    it('returns the dek', async () => {
+      // To pass this test, fill out the dek selector
+      // in ./src/extractors/custom/www.ndtv.com/index.js.
+      const { dek } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(
+        dek,
+        'Amid rising pressure, the government is likely to take a decision on the matter today and clear the move after calculating sufficient stocks for the country, sources have told NDTV.'
+      );
+    });
+
+    it('returns the lead_image_url', async () => {
+      // To pass this test, fill out the lead_image_url selector
+      // in ./src/extractors/custom/www.ndtv.com/index.js.
+      const { lead_image_url } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(
+        lead_image_url,
+        `https://c.ndtvimg.com/2020-04/u9vkhue_donald-trump-white-house-afp_625x300_04_April_20.jpg`
+      );
+    });
+
+    it('returns the content', async () => {
+      // To pass this test, fill out the content selector
+      // in ./src/extractors/custom/www.ndtv.com/index.js.
+      // You may also want to make use of the clean and transform
+      // options.
+      const { content } = await result;
+
+      const $ = cheerio.load(content || '');
+
+      const first13 = excerptContent(
+        $('*')
+          .first()
+          .text(),
+        13
+      );
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(
+        first13,
+        'Washington/ New Delhi: US President Donald Trump has said "there may be retaliation"'
+      );
+
+      // Confirm that the dateline is moved.
+      const dateline = $('.place_cont');
+      assert.equal(dateline.length, 1);
+      assert.equal(dateline.get(0).parent.tagName, 'p');
+    });
+  });
+});