From a11948c71bfe7b2aac6e50e7634874d5073c7d84 Mon Sep 17 00:00:00 2001 From: marc Date: Sat, 29 Oct 2016 21:04:01 -0500 Subject: [PATCH] Add language support for more engines. --- searx/engines/dailymotion.py | 18 +++++++++++++++ searx/engines/duckduckgo.py | 22 +------------------ searx/engines/gigablast.py | 2 +- searx/engines/qwant.py | 15 ++++++++++++- searx/engines/startpage.py | 5 +++++ searx/engines/swisscows.py | 8 +++++++ searx/engines/yandex.py | 4 +++- searx/languages.py | 4 +++- tests/unit/engines/test_duckduckgo.py | 2 +- .../engines/test_duckduckgo_definitions.py | 4 ++++ tests/unit/engines/test_google.py | 2 +- tests/unit/engines/test_qwant.py | 2 +- tests/unit/engines/test_swisscows.py | 2 +- tests/unit/engines/test_wikipedia.py | 2 +- utils/update_languages.py | 13 ++++------- 15 files changed, 66 insertions(+), 39 deletions(-) diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 317f34f59..4a7d7b6a8 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -20,6 +20,24 @@ from datetime import datetime categories = ['videos'] paging = True language_support = True +supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az", + "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca", + "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da", + "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo", + "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv", + "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr", + "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik", + "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk", + "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", + "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml", + "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv", + "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc", + "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu", + "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se", + "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss", + "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th", + "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur", + "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"] # search-url # see http://www.dailymotion.com/doc/api/obj-video.html diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index a1cb5882c..3e1752dd0 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -16,7 +16,6 @@ from urllib import urlencode from lxml.html import fromstring from searx.engines.xpath import extract_text -from searx.languages import language_codes # engine dependent config categories = ['general'] @@ -76,26 +75,7 @@ def request(query, params): else: # tries to get a country code from language locale = locale[0].lower() - lang_codes = [x[0] for x in language_codes] - for lc in lang_codes: - lc = lc.split('-') - if locale == lc[0] and len(lc) == 2: - locale = lc[1].lower() + '-' + lc[0].lower() - break - - if locale: - params['url'] = url.format( - query=urlencode({'q': query, 'kl': locale}), offset=offset) - else: - locale = params['language'].split('-') - if len(locale) == 2: - # country code goes first - locale = locale[1].lower() + '-' + locale[0].lower() - else: - # tries to get a country code from language - locale = locale[0].lower() - lang_codes = [x[0] for x in language_codes] - for lc in lang_codes: + for lc in supported_languages: lc = lc.split('-') if locale == lc[0]: locale = lc[1].lower() + '-' + lc[0].lower() diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 125ffa0a6..f012e1df2 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -44,7 +44,7 @@ supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el", "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr", "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv", - "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] + "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] # do search-request diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index d8b084292..200e9ada9 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -20,6 +20,11 @@ from searx.utils import html_to_text categories = None paging = True language_support = True +supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT", + "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU", + "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR", + "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH", + "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"] category_to_keyword = {'general': 'web', 'images': 'images', @@ -46,7 +51,15 @@ def request(query, params): # add language tag if specified if params['language'] != 'all': - params['url'] += '&locale=' + params['language'].lower() + locale = params['language'].split('-') + if len(locale) == 2 and params['language'] in supported_languages: + params['url'] += '&locale=' + params['language'].replace('-', '_').lower() + else: + # try to get a country code for language + for lang in supported_languages: + if locale[0] == lang.split('-')[0]: + params['url'] += '&locale=' + lang.replace('-', '_').lower() + break return params diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 54aafdee5..3814d9949 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -24,6 +24,11 @@ categories = ['general'] # paging = False language_support = True +supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW", + "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr", + "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja", + "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw", + "sv", "tl", "th", "tr", "uk", "vi"] # search-url base_url = 'https://startpage.com/' diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index 68632a15a..5c6b051a9 100644 --- a/searx/engines/swisscows.py +++ b/searx/engines/swisscows.py @@ -18,6 +18,12 @@ import re categories = ['general', 'images'] paging = True language_support = True +supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", + "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR", + "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT", + "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", + "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH", + "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"] # search-url base_url = 'https://swisscows.ch/' @@ -35,6 +41,8 @@ def request(query, params): if params['language'] == 'all': ui_language = 'browser' region = 'browser' + elif params['language'].split('-')[0] == 'no': + region = 'nb-NO' else: region = params['language'] ui_language = params['language'].split('-')[0] diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index eee345c45..65aee28b8 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -22,7 +22,9 @@ language_support = True # TODO default_tld = 'com' language_map = {'ru': 'ru', - 'ua': 'uk', + 'ua': 'ua', + 'be': 'by', + 'kk': 'kz', 'tr': 'com.tr'} # search-url diff --git a/searx/languages.py b/searx/languages.py index dddd380d4..a6cbdb9b1 100644 --- a/searx/languages.py +++ b/searx/languages.py @@ -100,7 +100,7 @@ language_codes = ( (u"sa", u"संस्कृतम्", u"", u"Sanskrit"), (u"he-IL", u"עברית", u"", u"Hebrew"), (u"se", u"Sámegiella", u"", u"Northern Sami"), - (u"sd", u"سنڌي، سندھی ، सिन्ध", u"", u"Sindhi"), + (u"sd", u"سنڌي ،सिन्ध", u"", u"Sindhi"), (u"fr-CH", u"Français", u"", u"French"), (u"zea", u"Zeêuws", u"", u"Zeelandic"), (u"it-CH", u"Italiano", u"", u"Italian"), @@ -191,6 +191,7 @@ language_codes = ( (u"jam", u"Jamaican Creole English", u"", u"Patois"), (u"udm", u"Удмурт кыл", u"", u"Udmurt"), (u"ksh", u"Ripoarisch", u"", u"Ripuarian"), + (u"sl-SI", u"Slovenščina", u"", u"Slovenian"), (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"), (u"de", u"Deutsch", u"", u"German"), (u"da", u"Dansk", u"", u"Danish"), @@ -284,6 +285,7 @@ language_codes = ( (u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"), (u"ca-CT", u"Català", u"", u"Catalan"), (u"en-MY", u"English", u"", u"English"), + (u"olo", u"Livvi-Karelian", u"", u"Livvinkarjala"), (u"sv-SE", u"Svenska", u"", u"Swedish"), (u"de-AT", u"Deutsch", u"", u"German"), (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"), diff --git a/tests/unit/engines/test_duckduckgo.py b/tests/unit/engines/test_duckduckgo.py index 734f2c39e..b5a4fd4f0 100644 --- a/tests/unit/engines/test_duckduckgo.py +++ b/tests/unit/engines/test_duckduckgo.py @@ -11,7 +11,7 @@ class TestDuckduckgoEngine(SearxTestCase): query = 'test_query' dicto = defaultdict(dict) dicto['pageno'] = 1 - dicto['language'] = 'de_CH' + dicto['language'] = 'de-CH' dicto['time_range'] = '' params = duckduckgo.request(query, dicto) self.assertIn('url', params) diff --git a/tests/unit/engines/test_duckduckgo_definitions.py b/tests/unit/engines/test_duckduckgo_definitions.py index 39da64175..feafe47ba 100644 --- a/tests/unit/engines/test_duckduckgo_definitions.py +++ b/tests/unit/engines/test_duckduckgo_definitions.py @@ -21,10 +21,14 @@ class TestDDGDefinitionsEngine(SearxTestCase): query = 'test_query' dicto = defaultdict(dict) dicto['pageno'] = 1 + dicto['language'] = 'es' params = duckduckgo_definitions.request(query, dicto) self.assertIn('url', params) self.assertIn(query, params['url']) self.assertIn('duckduckgo.com', params['url']) + self.assertIn('headers', params) + self.assertIn('Accept-Language', params['headers']) + self.assertIn('es', params['headers']['Accept-Language']) def test_response(self): self.assertRaises(AttributeError, duckduckgo_definitions.response, None) diff --git a/tests/unit/engines/test_google.py b/tests/unit/engines/test_google.py index 8e73e2ab7..c83eb3bf0 100644 --- a/tests/unit/engines/test_google.py +++ b/tests/unit/engines/test_google.py @@ -18,7 +18,7 @@ class TestGoogleEngine(SearxTestCase): query = 'test_query' dicto = defaultdict(dict) dicto['pageno'] = 1 - dicto['language'] = 'fr_FR' + dicto['language'] = 'fr-FR' dicto['time_range'] = '' params = google.request(query, dicto) self.assertIn('url', params) diff --git a/tests/unit/engines/test_qwant.py b/tests/unit/engines/test_qwant.py index 7d79d13d8..c4c0b0690 100644 --- a/tests/unit/engines/test_qwant.py +++ b/tests/unit/engines/test_qwant.py @@ -10,7 +10,7 @@ class TestQwantEngine(SearxTestCase): query = 'test_query' dicto = defaultdict(dict) dicto['pageno'] = 0 - dicto['language'] = 'fr_FR' + dicto['language'] = 'fr-FR' qwant.categories = [''] params = qwant.request(query, dicto) self.assertIn('url', params) diff --git a/tests/unit/engines/test_swisscows.py b/tests/unit/engines/test_swisscows.py index 3b4ce7b0f..dbbc044da 100644 --- a/tests/unit/engines/test_swisscows.py +++ b/tests/unit/engines/test_swisscows.py @@ -10,7 +10,7 @@ class TestSwisscowsEngine(SearxTestCase): query = 'test_query' dicto = defaultdict(dict) dicto['pageno'] = 1 - dicto['language'] = 'de_DE' + dicto['language'] = 'de-DE' params = swisscows.request(query, dicto) self.assertTrue('url' in params) self.assertTrue(query in params['url']) diff --git a/tests/unit/engines/test_wikipedia.py b/tests/unit/engines/test_wikipedia.py index d1c44036d..431cf69c7 100644 --- a/tests/unit/engines/test_wikipedia.py +++ b/tests/unit/engines/test_wikipedia.py @@ -10,7 +10,7 @@ class TestWikipediaEngine(SearxTestCase): def test_request(self): query = 'test_query' dicto = defaultdict(dict) - dicto['language'] = 'fr_FR' + dicto['language'] = 'fr-FR' params = wikipedia.request(query, dicto) self.assertIn('url', params) self.assertIn(query, params['url']) diff --git a/utils/update_languages.py b/utils/update_languages.py index cb230c210..6f86742c6 100644 --- a/utils/update_languages.py +++ b/utils/update_languages.py @@ -41,7 +41,6 @@ def valid_code(lang_code): if len(lang_code) > 2 or len(lang_code[0]) > 3: return False if len(lang_code) == 2 and len(lang_code[1]) > 2: - print lang_code return False return True @@ -62,8 +61,8 @@ def get_wikipedia_languages(): english_name = td[1].xpath('./a')[0].text articles = int(td[4].xpath('./a/b')[0].text.replace(',','')) - # exclude languages with few articles and language variants - if code not in languages and articles >= 100 and valid_code(code): + # exclude language variants and languages with few articles + if code not in languages and articles >= 1000 and valid_code(code): languages[code] = (name, '', english_name) @@ -90,7 +89,7 @@ def join_language_lists(): # try to get language name language = languages.get(locale.split('-')[0], None) if language == None: - # print engine_name + ": " + locale + print engine_name + ": " + locale continue (name, country, english) = language @@ -117,12 +116,8 @@ def write_languages_file(): new_file.close() -def main(): +if __name__ == "__main__": get_wikipedia_languages() get_google_languages() join_language_lists() write_languages_file() - - -if __name__ == "__main__": - main()