Text summarizer based on `sumy`. Uses `ftfy` to keep Unicode text sane
as it appears on the web, and `dominate` to generate HTML without fuss.
master
parent
2ed6489594
commit
4b6b7f9ffb
@ -0,0 +1,66 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division, print_function, unicode_literals
|
||||||
|
|
||||||
|
import dominate
|
||||||
|
from ftfy import fix_text
|
||||||
|
from sumy.parsers.html import HtmlParser
|
||||||
|
from sumy.nlp.tokenizers import Tokenizer
|
||||||
|
from sumy.parsers.plaintext import PlaintextParser
|
||||||
|
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
|
||||||
|
from sumy.nlp.stemmers import Stemmer
|
||||||
|
from sumy.utils import get_stop_words
|
||||||
|
from dominate.tags import *
|
||||||
|
|
||||||
|
from qutescript import userscript
|
||||||
|
|
||||||
|
# Language name passed to sumy's Tokenizer, Stemmer and get_stop_words.
LANGUAGE = "english"
# Number of sentences requested from the summarizer for each page.
SENTENCES_COUNT = 10
|
||||||
|
|
||||||
|
|
||||||
|
def generate_html(sentences, title_text):
    """Build the HTML page that presents a summary.

    Args:
        sentences: Iterable of sentence strings; each one is emitted as its
            own ``<p>`` inside ``<div class="body">``.
        title_text: Heading text, rendered as an ``<h1>`` inside
            ``<div id="header">``.

    Returns:
        The ``dominate.document`` (titled 'Summary'). The caller is expected
        to call ``.render()`` on it to obtain the markup.
    """
    page = dominate.document(title='Summary')

    # Inline stylesheet for the page body (colours, font, size, margins).
    page_css = """\
body {
background-color: #F9F8F1;
color: #2C232A;
font-family: sans-serif;
font-size: 2.6em;
margin: 3em 1em;
}

"""
    with page.head:
        style(page_css)

    with page:
        # Header block holding the page title.
        with div(id='header'):
            h1(title_text)
        # One paragraph per summary sentence.
        with div(cls='body'):
            for line in sentences:
                p(line)

    return page
|
||||||
|
|
||||||
|
|
||||||
|
@userscript
def summarize_text(request):
    """Summarize the current page and send the summary back as HTML.

    The page is parsed with sumy (HTML when ``request.html`` is set,
    plain text otherwise), reduced to ``SENTENCES_COUNT`` sentences with the
    LSA summarizer, cleaned up with ``ftfy.fix_text`` and rendered through
    ``generate_html``.

    Args:
        request: The request object supplied by the ``@userscript``
            decorator. This function reads ``request.html``,
            ``request.text``, ``request.url`` and ``request.title`` and calls
            ``request.send_html``.
    """
    tokenizer = Tokenizer(LANGUAGE)

    if request.html:
        parser = HtmlParser.from_file(file_path=request.html,
                                      url=request.url,
                                      tokenizer=tokenizer)
    else:
        # Fall back to the plain-text dump. The original passed
        # ``request.html`` here, which is always falsy in this branch, so
        # the fallback could never open a file.
        # NOTE(review): assumes the request exposes the QUTE_TEXT path as
        # ``request.text`` -- confirm against qutescript.
        parser = PlaintextParser.from_file(file_path=request.text,
                                           tokenizer=tokenizer)

    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Each summary sentence is stringified and passed through fix_text
    # before being placed in the page.
    sentences = [fix_text(str(s))
                 for s in summarizer(parser.document, SENTENCES_COUNT)]

    html = generate_html(sentences, fix_text(request.title)).render()
    request.send_html(html)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the summarizer when this file is executed directly.
# summarize_text is called without arguments; presumably the @userscript
# decorator supplies ``request`` -- confirm against qutescript.
if __name__ == "__main__":
    summarize_text()
|
@ -0,0 +1,3 @@
|
|||||||
|
dominate
|
||||||
|
ftfy
|
||||||
|
sumy
|
Loading…
Reference in New Issue