From 3ec35a06f7840f6e387eb4f518f6673394efb21a Mon Sep 17 00:00:00 2001 From: rinpatch Date: Thu, 26 Apr 2018 22:42:31 +0300 Subject: [PATCH 1/9] Added Engine --- searx/engines/acgsou.py | 85 +++++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 6 +++ 2 files changed, 91 insertions(+) create mode 100644 searx/engines/acgsou.py diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py new file mode 100644 index 00000000..10655cb8 --- /dev/null +++ b/searx/engines/acgsou.py @@ -0,0 +1,85 @@ +""" + Acgsou (Japanese Animation/Music/Comics Bittorrent tracker) + + @website https://www.acgsou.com/ + @provide-api no + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content, seed, leech, torrentfile +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode +from searx.utils import get_torrent_size, int_or_zero + +# engine dependent config +categories = ['files', 'images', 'videos', 'music'] +paging = True + +# search-url +base_url = 'https://www.acgsou.com/' +search_url = base_url + 'search.php?{query}&page={offset}' +# xpath queries +xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]' +xpath_category = './/td[2]/a[1]' +xpath_title = './/td[3]/a[last()]' +xpath_torrent_links = './/td[3]/a' +xpath_filesize = './/td[4]/text()' + +# do search-request +def request(query, params): + query = urlencode({'keyword': query}) + params['url'] = search_url.format(query=query, offset=params['pageno']) + return params + + +# get response from search-request +def response(resp): + results = [] + dom = html.fromstring(resp.text) + print(resp.text) + for result in dom.xpath(xpath_results): + # defaults + filesize = 0 + magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce" + torrent_link = "" + + # category in which our torrent belongs + try: + category = extract_text(result.xpath(xpath_category)[0]) + except: + pass + + # torrent title + page_a = result.xpath(xpath_title)[0] + title = extract_text(page_a) + + # link to the page + href = base_url + page_a.attrib.get('href') + + #magnet link + magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5]) + + # let's try to calculate the torrent size + try: + filesize_info = result.xpath(xpath_filesize)[0] + filesize = filesize_info[:-2] + filesize_multiplier = filesize_info[-2:] + filesize = get_torrent_size(filesize, filesize_multiplier) + except : + pass + + # content string contains all information not included into template + content = 'Category: "{category}".' + content = content.format(category=category) + + results.append({'url': href, + 'title': title, + 'content': content, + 'filesize': filesize, + 'magnetlink': magnet_link, + 'template': 'torrent.html'}) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index d72d01a5..988076f8 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -433,6 +433,12 @@ engines: engine : nyaa shortcut : nt disabled : True + + - name : acgsou + engine : acgsou + shortcut : acg + disabled : True + timeout: 5.0 - name : openairedatasets engine : json_engine From fb364ffae7f5a41ab4b6b4ecbc2a9194da532a5c Mon Sep 17 00:00:00 2001 From: rinpatch Date: Fri, 27 Apr 2018 08:23:58 +0300 Subject: [PATCH 2/9] Hopefully fix code style errors --- searx/engines/acgsou.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index 10655cb8..ad423457 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -28,14 +28,11 @@ xpath_title = './/td[3]/a[last()]' xpath_torrent_links = './/td[3]/a' xpath_filesize = './/td[4]/text()' -# do search-request def request(query, params): query = urlencode({'keyword': query}) params['url'] = search_url.format(query=query, offset=params['pageno']) return params - -# get response from search-request def response(resp): results = [] dom = html.fromstring(resp.text) @@ -46,23 +43,17 @@ def response(resp): magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce" torrent_link = "" - # category in which our torrent belongs try: category = extract_text(result.xpath(xpath_category)[0]) except: pass - # torrent title page_a = result.xpath(xpath_title)[0] title = extract_text(page_a) - - # link to the page href = base_url + page_a.attrib.get('href') - - #magnet link + magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5]) - # let's try to calculate the torrent size try: filesize_info = result.xpath(xpath_filesize)[0] filesize = filesize_info[:-2] @@ -70,16 +61,14 @@ def response(resp): filesize = get_torrent_size(filesize, filesize_multiplier) except : pass - - # content string contains all information not included into template + #I didn't add download/seed/leech count since as I figured out they are generated randowmly everytime content = 'Category: "{category}".' content = content.format(category=category) - + results.append({'url': href, 'title': title, 'content': content, 'filesize': filesize, 'magnetlink': magnet_link, 'template': 'torrent.html'}) - return results From b8b23d800640c3fb07e0110008ddb633277b045e Mon Sep 17 00:00:00 2001 From: rinpatch Date: Fri, 27 Apr 2018 08:29:17 +0300 Subject: [PATCH 3/9] Hopefully fix code style errors(again) --- searx/engines/acgsou.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index ad423457..5a157101 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -28,16 +28,18 @@ xpath_title = './/td[3]/a[last()]' xpath_torrent_links = './/td[3]/a' xpath_filesize = './/td[4]/text()' + def request(query, params): query = urlencode({'keyword': query}) params['url'] = search_url.format(query=query, offset=params['pageno']) return params + def response(resp): results = [] dom = html.fromstring(resp.text) print(resp.text) - for result in dom.xpath(xpath_results): + for result in dom.xpath(xpath_results): # defaults filesize = 0 magnet_link = "magnet:?xt=urn:btih:{}&tr=http://tracker.acgsou.com:2710/announce" @@ -59,12 +61,12 @@ def response(resp): filesize = filesize_info[:-2] filesize_multiplier = filesize_info[-2:] filesize = get_torrent_size(filesize, filesize_multiplier) - except : + except: pass #I didn't add download/seed/leech count since as I figured out they are generated randowmly everytime content = 'Category: "{category}".' content = content.format(category=category) - + results.append({'url': href, 'title': title, 'content': content, From 86f47689fea8df37571e8413223fe1a3847fb0b8 Mon Sep 17 00:00:00 2001 From: rinpatch Date: Fri, 27 Apr 2018 08:32:43 +0300 Subject: [PATCH 4/9] Hopefully fix code style errors(again)(again) --- searx/engines/acgsou.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index 5a157101..73f0361e 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -63,7 +63,7 @@ def response(resp): filesize = get_torrent_size(filesize, filesize_multiplier) except: pass - #I didn't add download/seed/leech count since as I figured out they are generated randowmly everytime + # I didn't add download/seed/leech count since as I figured out they are generated randowmly everytime content = 'Category: "{category}".' content = content.format(category=category) From e3bd2670d7d886ba7fb7540781fec130d1e80c9c Mon Sep 17 00:00:00 2001 From: rinpatch Date: Fri, 27 Apr 2018 08:36:11 +0300 Subject: [PATCH 5/9] Traling whitespace --- searx/engines/acgsou.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index 73f0361e..ebe25364 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -36,7 +36,7 @@ def request(query, params): def response(resp): - results = [] + results = [] dom = html.fromstring(resp.text) print(resp.text) for result in dom.xpath(xpath_results): From dcc9fdb47fd16aa65216846dc52470ada016753c Mon Sep 17 00:00:00 2001 From: rinpatch Date: Fri, 27 Apr 2018 15:36:15 +0300 Subject: [PATCH 6/9] Added unit test --- searx/engines/acgsou.py | 2 +- tests/unit/engines/test_acgsou.py | 67 +++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 tests/unit/engines/test_acgsou.py diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index ebe25364..c1d8cccf 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -63,7 +63,7 @@ def response(resp): filesize = get_torrent_size(filesize, filesize_multiplier) except: pass - # I didn't add download/seed/leech count since as I figured out they are generated randowmly everytime + # I didn't add download/seed/leech count since as I figured out they are generated randomly everytime content = 'Category: "{category}".' content = content.format(category=category) diff --git a/tests/unit/engines/test_acgsou.py b/tests/unit/engines/test_acgsou.py new file mode 100644 index 00000000..2c3e6608 --- /dev/null +++ b/tests/unit/engines/test_acgsou.py @@ -0,0 +1,67 @@ +from collections import defaultdict +import mock +from searx.engines import acgsou +from searx.testing import SearxTestCase + + +class TestAcgsouEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dic = defaultdict(dict) + dic['pageno'] = 1 + params = acgsou.request(query, dic) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('acgsou.com' in params['url']) + + def test_response(self): + resp = mock.Mock(text='') + self.assertEqual(acgsou.response(resp), []) + + html = """ + + + + tablehead + + + + + + + + + + + + + +
datetestcategory + torrentname + 1MB + + 29 + + + + 211 + + + + 168 + + user
+ """ + + resp = mock.Mock(text=html) + results = acgsou.response(resp) + + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + + r = results[0] + self.assertEqual(r['url'], 'https://www.acgsou.com/show-torrentid.html') + self.assertEqual(r['content'], 'Category: "testcategory".') + self.assertEqual(r['title'], 'torrentname') + self.assertEqual(r['filesize'], 1048576) From f17e54a396a77f8668d227a41d37737aff340a84 Mon Sep 17 00:00:00 2001 From: rinpatch Date: Fri, 27 Apr 2018 15:40:42 +0300 Subject: [PATCH 7/9] Forgot to remove print command used for debugging --- searx/engines/acgsou.py | 1 - 1 file changed, 1 deletion(-) diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index c1d8cccf..9cedeeec 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -38,7 +38,6 @@ def request(query, params): def response(resp): results = [] dom = html.fromstring(resp.text) - print(resp.text) for result in dom.xpath(xpath_results): # defaults filesize = 0 From 1ac0c90037923c06a337b7236b678d8ca2b45e5f Mon Sep 17 00:00:00 2001 From: rinpatch Date: Fri, 27 Apr 2018 17:33:23 +0300 Subject: [PATCH 8/9] Fix unit test --- tests/unit/engines/test_acgsou.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/unit/engines/test_acgsou.py b/tests/unit/engines/test_acgsou.py index 2c3e6608..d115c532 100644 --- a/tests/unit/engines/test_acgsou.py +++ b/tests/unit/engines/test_acgsou.py @@ -20,14 +20,22 @@ class TestAcgsouEngine(SearxTestCase): self.assertEqual(acgsou.response(resp), []) html = """ + - tablehead + + + + + + + + - + +
发布时间分类资源名称大小种子下载完成发布者/联盟
date testcategory @@ -51,7 +59,9 @@ class TestAcgsouEngine(SearxTestCase): user
+ """ resp = mock.Mock(text=html) From a79c676f2766c00122695fe74ad42ff89cfdcd07 Mon Sep 17 00:00:00 2001 From: rinpatch Date: Fri, 27 Apr 2018 17:38:52 +0300 Subject: [PATCH 9/9] Remove Japanese characters --- tests/unit/engines/test_acgsou.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/unit/engines/test_acgsou.py b/tests/unit/engines/test_acgsou.py index d115c532..c3ea4805 100644 --- a/tests/unit/engines/test_acgsou.py +++ b/tests/unit/engines/test_acgsou.py @@ -24,14 +24,14 @@ class TestAcgsouEngine(SearxTestCase): - - - - - - - - + + + + + + + +
发布时间分类资源名称大小种子下载完成发布者/联盟testtesttesttesttesttesttesttest