jisho : code refactoring

This commit is contained in:
Alexandre Flament 2022-04-02 15:21:58 +02:00
parent 19fa0095a0
commit 74c7aee9ec

View File

@ -17,7 +17,6 @@ about = {
} }
categories = ['dictionaries'] categories = ['dictionaries']
engine_type = 'online_dictionary'
paging = False paging = False
URL = 'https://jisho.org' URL = 'https://jisho.org'
@ -34,19 +33,19 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
infoboxed = False first_result = True
search_results = resp.json() search_results = resp.json()
pages = search_results.get('data', [])
for page in pages: for page in search_results.get('data', []):
# Entries that are purely from Wikipedia are excluded. # Entries that are purely from Wikipedia are excluded.
if page['senses'][0]['parts_of_speech'] != [] and page['senses'][0]['parts_of_speech'][0] == 'Wikipedia definition': parts_of_speech = page.get('senses') and page['senses'][0].get('parts_of_speech')
if parts_of_speech and parts_of_speech[0] == 'Wikipedia definition':
pass pass
# Process alternative forms # Process alternative forms
japanese = page['japanese']
alt_forms = [] alt_forms = []
for title_raw in japanese: for title_raw in page['japanese']:
if 'word' not in title_raw: if 'word' not in title_raw:
alt_forms.append(title_raw['reading']) alt_forms.append(title_raw['reading'])
else: else:
@ -54,74 +53,84 @@ def response(resp):
if 'reading' in title_raw: if 'reading' in title_raw:
title += ' (' + title_raw['reading'] + ')' title += ' (' + title_raw['reading'] + ')'
alt_forms.append(title) alt_forms.append(title)
# Process definitions
definitions = [] #
def_raw = page['senses'] result_url = urljoin(BASE_URL, page['slug'])
for defn_raw in def_raw: definitions = get_definitions(page)
extra = ''
if not infoboxed:
# Extra data. Since they're not documented, this implementation is based solely by the author's assumptions.
if defn_raw['tags'] != []:
if defn_raw['info'] != []:
extra += defn_raw['tags'][0] + ', ' + defn_raw['info'][0] + '. ' # "usually written as kana: <kana>"
else:
extra += ', '.join(defn_raw['tags']) + '. ' # abbreviation, archaism, etc.
elif defn_raw['info'] != []:
extra += ', '.join(defn_raw['info']).capitalize() + '. ' # inconsistent
if defn_raw['restrictions'] != []:
extra += 'Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. '
extra = extra[:-1]
definitions.append((
', '.join(defn_raw['parts_of_speech']),
'; '.join(defn_raw['english_definitions']),
extra
))
content = ''
infobox_content = '''
<small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a>
and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a>
by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small><ul>
'''
for pos, engdef, extra in definitions:
if pos == 'Wikipedia definition':
infobox_content += '</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>'
if pos == '':
infobox_content += f"<li>{engdef}"
else:
infobox_content += f"<li><i>{pos}</i>: {engdef}"
if extra != '':
infobox_content += f" ({extra})"
infobox_content += '</li>'
content += f"{engdef}. "
infobox_content += '</ul>'
# For results, we'll return the URL, all alternative forms (as title), # For results, we'll return the URL, all alternative forms (as title),
# and all definitions (as description) truncated to 300 characters. # and all definitions (as description) truncated to 300 characters.
content = " ".join(f"{engdef}." for _, engdef, _ in definitions)
results.append({ results.append({
'url': urljoin(BASE_URL, page['slug']), 'url': result_url,
'title': ", ".join(alt_forms), 'title': ", ".join(alt_forms),
'content': content[:300] + (content[300:] and '...') 'content': content[:300] + (content[300:] and '...')
}) })
# Like Wordnik, we'll return the first result in an infobox too. # Like Wordnik, we'll return the first result in an infobox too.
if not infoboxed: if first_result:
infoboxed = True first_result = False
infobox_urls = [] results.append(get_infobox(alt_forms, result_url, definitions))
infobox_urls.append({
'title': 'Jisho.org',
'url': urljoin(BASE_URL, page['slug'])
})
infobox = {
'infobox': alt_forms[0],
'urls': infobox_urls
}
alt_forms.pop(0)
alt_content = ''
if len(alt_forms) > 0:
alt_content = '<p><i>Other forms:</i> '
alt_content += ", ".join(alt_forms)
alt_content += '</p>'
infobox['content'] = alt_content + infobox_content
results.append(infobox)
return results return results
def get_definitions(page):
    """Extract the definitions of one Jisho result entry.

    Args:
        page: dict describing one entry of the API response.  Its ``senses``
            list is read; every sense is a dict that may carry the keys
            ``parts_of_speech``, ``english_definitions``, ``tags``, ``info``
            and ``restrictions`` (each a list of strings).  Missing or empty
            keys are treated as empty lists.

    Returns:
        A list of ``(parts_of_speech, english_definitions, extra)`` tuples of
        strings, one per sense, in the order of the senses.  ``extra`` is an
        empty string when the sense has no tags, info or restrictions.
    """
    definitions = []
    for sense in page.get('senses') or []:
        tags = sense.get('tags') or []
        info = sense.get('info') or []
        restrictions = sense.get('restrictions') or []

        # Extra data.  Since they're not documented, this implementation is
        # based solely on the author's assumptions.
        notes = []
        if tags and info:
            # e.g. "usually written as kana: <kana>" -- only the first tag and
            # the first info item are kept in this case.
            notes.append(f'{tags[0]}, {info[0]}.')
        elif tags:
            # abbreviation, archaism, etc.
            notes.append(', '.join(tags) + '.')
        elif info:
            # Free-form notes with inconsistent casing: normalise the first letter.
            notes.append(', '.join(info).capitalize() + '.')
        if restrictions:
            notes.append('Only applies to: ' + ', '.join(restrictions) + '.')

        definitions.append((
            ', '.join(sense.get('parts_of_speech') or []),
            '; '.join(sense.get('english_definitions') or []),
            ' '.join(notes),
        ))
    return definitions
def get_infobox(alt_forms, result_url, definitions):
    """Build the infobox result for one dictionary entry.

    Args:
        alt_forms: non-empty list of display forms of the word.  The first one
            becomes the infobox title, the remaining ones are listed under
            "Other forms".
        result_url: URL of the entry, used for the "Jisho.org" link.
        definitions: iterable of ``(pos, engdef, extra)`` string tuples, as
            produced by ``get_definitions``.

    Returns:
        A dict with the keys ``infobox`` (title), ``content`` (HTML fragment)
        and ``urls`` (a one-element list holding the Jisho.org link).

    Raises:
        IndexError: if ``alt_forms`` is empty.
    """
    # Function-local import so the block does not depend on a file-level import.
    from html import escape

    # ``content`` is a hand-built HTML fragment, so every interpolated value is
    # escaped to keep markup in the data from being injected into the page.
    # NOTE(review): the title is returned as-is; presumably the template
    # escapes it -- confirm.
    infobox_title = alt_forms[0]
    fragments = []

    # title & alt_forms
    other_forms = alt_forms[1:]
    if other_forms:
        escaped_forms = ', '.join(escape(form) for form in other_forms)
        fragments.append(f'<p><i>Other forms:</i> {escaped_forms}</p>')

    # definitions, preceded by the attribution required by the data licence
    fragments.append(
        '\n'
        '<small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a>\n'
        'and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a>\n'
        'by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small>\n'
        '<ul>\n'
    )
    for pos, engdef, extra in definitions:
        if pos == 'Wikipedia definition':
            # Wikipedia-sourced senses get their own attribution and list.
            fragments.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
        pos_html = f'<i>{escape(pos)}</i>: ' if pos else ''
        extra_html = f' ({escape(extra)})' if extra else ''
        fragments.append(f'<li>{pos_html}{escape(engdef)}{extra_html}</li>')
    fragments.append('</ul>')

    return {
        'infobox': infobox_title,
        'content': ''.join(fragments),
        'urls': [
            {
                'title': 'Jisho.org',
                'url': result_url,
            }
        ],
    }