mirror of https://github.com/Krazybug/calishot
First public version
commit 3848c62530
@ -0,0 +1,5 @@
*.pyc
calishot.egg-info/
.vscode
.DS_Store
output/
@ -0,0 +1,108 @@
# CALISHOT Guidelines

## Installation

You need poetry preinstalled.

Clone the repository, then:

```
poetry install
poetry shell
mkdir output
cd output
```

Then create a `list.txt` file with all your Calibre server URLs.
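For example, a minimal `list.txt` might look like this (hypothetical URLs, one server per line):

```
http://calibre.example.com:8080
http://203.0.113.17:8081
```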
## Indexing

```
python ../calishot import list.txt

python ../calishot check

sqlite-utils sites.db 'select url from sites where status="online" ' | jq -r '.[].url' > online.txt

python ../calishot index-site-list online.txt

python ../calishot build-index --english
mv index.db index-eng.db

python ../calishot build-index --noenglish
mv index.db index-non-eng.db

# display the global size and the total count of formats
python ../calishot get-stats

python ../calishot index-to-json | jq -r '. | {title: .title.label, authors, year, language, publisher, series, desc: .title.href, tags, identifiers, formats, format_links: [.links[].href]}' > calibre.json

sqlite-utils index.db 'select uuid, title, authors, year, series, language, formats, publisher, tags, identifiers from summary where instr(formats, "mp3") >0 order by uuid limit 101'
```
## Deployment

1. Install poetry, datasette and its plugins:

```
poetry new calishot
poetry shell
poetry add datasette
poetry add datasette-json-html
poetry add datasette-pretty-json
```

Alternatively, you can install everything with virtualenv/pip if you don't want to use poetry:

```
python -m venv calishot
. ./calishot/bin/activate
pip install datasette
pip install datasette-json-html
pip install datasette-pretty-json
```

2. Prepare the calishot settings:

Download the SQLite db file to the same directory, then:

```
cat <<EOF > metadata.json
{
    "databases": {
        "index": {
            "tables": {
                "summary": {
                    "sort": "title",
                    "searchmode": "raw"
                }
            }
        }
    }
}
EOF
```
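The `"searchmode": "raw"` setting makes Datasette pass the search string straight to SQLite FTS, so advanced query syntax such as `title:fre*` works from the search box.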
You can now run a local test:

```
datasette serve index-non-eng.db --config sql_time_limit_ms:50000 --config allow_download:off --config max_returned_rows:2000 --config num_sql_threads:10 --config allow_csv_stream:off --metadata metadata.json
```

Open your browser to http://localhost:8001/ and check the result.
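You can also sanity-check Datasette's JSON API from the command line (assuming the default port and this database name):

```
curl 'http://localhost:8001/index-non-eng/summary.json?_size=1'
```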
3. Now you're ready to publish :)

Install [heroku-cli](https://devcenter.heroku.com/articles/heroku-cli), then:

```
export NODE_EXTRA_CA_CERTS=<your_dir>/calishot/CAall.cer
```

```
heroku login -i

datasette publish heroku index-non-eng.db -n calishot-non-eng-1 --install=datasette-json-html --install=datasette-pretty-json --extra-options="--config sql_time_limit_ms:50000 --config allow_download:off --config num_sql_threads:10 --config max_returned_rows:500 --config allow_csv_stream:off" --metadata metadata.json
```
@ -0,0 +1 @@
__version__ = '0.1.0'
@ -0,0 +1,19 @@
import fire

from site_index import import_urls_from_file, check_calibre_list, check_calibre_site
from calistat import index_site_list, get_stats, index_site_list_seq
from ebooks_index import build_index, index_to_json
from diff import diff

if __name__ == "__main__":
    fire.Fire({
        "import": import_urls_from_file,
        "check": check_calibre_list,
        "check-site": check_calibre_site,
        "index-site-list": index_site_list,
        "index-site-list-seq": index_site_list_seq,
        "build-index": build_index,
        "get-stats": get_stats,
        "index-to-json": index_to_json,
        "diff": diff,
    })
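# Example invocations (run from the output/ directory, as in the README):
#   python ../calishot import list.txt
#   python ../calishot check
#   python ../calishot build-index --english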
@ -0,0 +1,514 @@
import sys
import os
import time
import re
import shutil
import json
from typing import Dict

import requests
from humanize import naturalsize as hsize
import humanize
from langid.langid import LanguageIdentifier, model
import iso639
import unidecode

from requests.adapters import HTTPAdapter
import urllib.parse
import urllib3
from pathlib import Path
import uuid
from sqlite_utils import Database

import gevent
from gevent import monkey
from gevent import Timeout
from gevent.pool import Pool
monkey.patch_socket()
# monkey.patch_all()
import fire

from site_index import init_sites_db, get_libs_from_site

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
def get_site_db(uuid, dir):
    f_uuid = str(uuid) + ".db"
    print(f_uuid)
    path = Path(dir) / str(f_uuid)
    return Database(path)


def init_site_db(site, _uuid="", dir="."):

    if not _uuid:
        s_uuid = str(uuid.uuid4())
    else:
        s_uuid = str(_uuid)

    f_uuid = s_uuid + ".db"
    path = Path(dir) / f_uuid
    db = Database(path)

    if not "site" in db.table_names():
        s = db["site"]
        s.insert(
            {
                "uuid": s_uuid,
                "urls": [site],
                "version": "",
                "major": 0,
                "schema_version": 1,
            },
            pk='uuid'
        )

    if not "ebooks" in db.table_names():
        db["ebooks"].create({
            "uuid": str,
            "id": int,
            "library": str,  # TODO: manage library ids as integers to prevent library renames on the remote site
            "title": str,
            "authors": str,
            "series": str,
            "series_index": int,
            # "edition": int,
            "language": str,
            "desc": str,
            "identifiers": str,
            "tags": str,
            "publisher": str,
            "pubdate": str,
            "last_modified": str,
            "timestamp": str,
            "formats": str,
            "cover": int,
            # TODO: add the most common formats to avoid alter tables
        }, pk="uuid")

    if not "libraries" in db.table_names():
        db["libraries"].create({
            "id": int,
            "names": str
        }, pk="id")

    return db


def get_format_url(db, book, format):
    # Build the download URL of one format of a book on the remote Calibre server.
    url = json.loads(list(db['site'].rows)[0]["urls"])[0]
    library = book['library']
    id_ = str(book['id'])

    f_url = url + "/get/" + format + "/" + id_ + "/" + library
    return f_url


def get_desc_url(db, book):
    # Build the URL of the book details page; the path depends on the Calibre major version.
    url = json.loads(list(db['site'].rows)[0]["urls"])[0]

    library = book['library']
    id_ = str(book['id'])

    major = list(db['site'].rows)[0]["major"]

    if major >= 3:
        d_url = url + "#book_id=" + id_ + "&library_id=" + library + "&panel=book_details"
    else:
        d_url = url + "/browse/book/" + id_

    return d_url
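
# For example (hypothetical server http://example.com:8080, book id 42, library "Library"):
#   Calibre >= 3: http://example.com:8080#book_id=42&library_id=Library&panel=book_details
#   older:        http://example.com:8080/browse/book/42
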
def save_books_metadata_from_site(db, books):
    ebooks_t = db["ebooks"]

    # alter=True adds a column on the fly for any format not seen before.
    ebooks_t.insert_all(books, alter=True, pk='uuid', batch_size=1000)


def load_metadata(dir, uuid):
    pass


def update_done_status(book):
    source = book['source']
    if source['status'] != 'ignored':
        # Done when every format listed by the source is also present locally.
        if set(source['formats'].keys()) == set(book['formats']) & set(source['formats'].keys()):
            book['source']['status'] = "done"
        else:
            book['source']['status'] = "todo"


def index_site_list_seq(file):
    # Sequential variant of index_site_list: index the sites one by one.
    with open(file) as f:
        for s in f.readlines():
            index_ebooks(s.rstrip())


def index_site_list(file):
    pool = Pool(40)

    with open(file) as f:
        sites = f.readlines()
    sites = [s.rstrip() for s in sites]
    print(sites)
    pool.map(index_ebooks_except, sites)


def index_ebooks_except(site):
    try:
        index_ebooks(site)
    except Exception:
        print("Error on site", site)
def index_ebooks(site, library="", start=0, stop=0, dir=".", num=1000, force_refresh=False):

    # TODO: old Calibre servers don't manage libraries and the /ajax/library-info endpoint
    # doesn't exist there. It would be better to manage the Calibre version directly.
    libs = []
    try:
        libs = get_libs_from_site(site)
    except Exception:
        print("old lib")

    _uuid = str(uuid.uuid4())

    if libs:
        for lib in libs:
            index_ebooks_from_library(site=site, _uuid=_uuid, library=lib, start=start, stop=stop, dir=dir, num=num, force_refresh=force_refresh)
    else:
        index_ebooks_from_library(site=site, _uuid=_uuid, start=start, stop=stop, dir=dir, num=num, force_refresh=force_refresh)


def index_ebooks_from_library(site, _uuid="", library="", start=0, stop=0, dir=".", num=1000, force_refresh=False):

    offset = 0 if not start else start - 1
    num = min(1000, num)
    server = site.rstrip('/')
    api = server + '/ajax/'
    lib = library
    library = '/' + library if library else library

    timeout = 15

    print(f"\nIndexing library: {lib} from server: {server}")
    url = api + 'search' + library + '?num=0'
    print(f"\nGetting ebooks count of library: {lib} from server: {server}")

    try:
        r = requests.get(url, verify=False, timeout=(timeout, 30))
        r.raise_for_status()
    except requests.RequestException:
        print("Unable to open site:", url)
        return
    except Exception as e:
        print("Other issue:", e)
        return

    total_num = int(r.json()["total_num"])
    total_num = total_num if not stop else stop
    print()
    print(f"Total count={total_num} from {server}")

    db = init_site_db(site, _uuid=_uuid, dir=dir)
    r_site = list(db['site'].rows)[0]

    # The 'server' response header looks like "calibre/3.x": keep the major version.
    r_site['version'] = r.headers['server']
    r_site['major'] = int(re.search(r'calibre.(\d).*', r.headers['server']).group(1))
    db["site"].upsert(r_site, pk='uuid')

    print()
    rank = offset + 1  # running counter of processed ebooks (was named 'range', shadowing the builtin)
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        print('\r {:180.180}'.format(f'Downloading ids: offset={offset} count={remaining_num} from {server}'), end='')

        url = api + 'search' + library + '?num=' + str(remaining_num) + '&offset=' + str(offset) + '&sort=timestamp&sort_order=desc'

        try:
            r = requests.get(url, verify=False, timeout=(timeout, 30))
            r.raise_for_status()
        except requests.RequestException as e:
            print("Connection issue:", e)
            return
        except Exception as e:
            print("Other issue:", e)
            return

        print('\r {:180.180}'.format(f'Downloading metadata from {offset+1} to {offset+remaining_num}/{total_num} from {server}'), end='')
        books_s = ",".join(str(i) for i in r.json()['book_ids'])
        url = api + 'books' + library + '?ids=' + books_s

        try:
            r = requests.get(url, verify=False, timeout=(60, 60))
            r.raise_for_status()
        except requests.RequestException as e:
            print("Connection issue:", e)
            return
        except Exception as e:
            print("Other issue:", e)
            return

        print('\r {:180.180}'.format(f'{len(r.json())} received'), end='')

        books = []
        for id, r_book in r.json().items():
            uuid = r_book['uuid']
            if not uuid:
                print("No uuid for ebook: ignored")
                continue

            if r_book['authors']:
                desc = f"({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc = f"({r_book['title']})"

            print('\r {:180.180} '.format(f'{rank}/{total_num} ({server}) : {uuid} --> {desc}'), end='')

            if not force_refresh:
                try:
                    book = load_metadata(dir, uuid)
                except Exception:
                    print("Unable to get metadata from:", uuid)
                    rank += 1
                    continue
                if book:
                    print("Metadata already present for:", uuid)
                    rank += 1
                    continue

            if not r_book['formats']:
                rank += 1
                continue

            book = {}
            book['uuid'] = r_book['uuid']
            book['id'] = id
            book['library'] = lib

            # unidecode: normalize titles/authors/tags to plain ASCII for indexing.
            book['title'] = unidecode.unidecode(r_book['title'])

            if r_book['authors']:
                book['authors'] = [unidecode.unidecode(s) for s in r_book['authors']]

            book['desc'] = r_book['comments']

            if r_book['series']:
                book['series'] = unidecode.unidecode(r_book['series'])
                s_i = r_book['series_index']
                if s_i:
                    book['series_index'] = int(s_i)

            book['identifiers'] = r_book['identifiers']

            if r_book['tags']:
                book['tags'] = [unidecode.unidecode(s) for s in r_book['tags']]

            book['publisher'] = r_book['publisher']
            book['pubdate'] = r_book['pubdate']

            if not r_book['languages']:
                # No language declared: guess it from the comments (or the title) with langid.
                text = r_book['title'] + ". "
                if r_book['comments']:
                    text = r_book['comments']
                s_language, prob = identifier.classify(text)
                if prob >= 0.85:
                    book['language'] = iso639.to_iso639_2(s_language)
                else:
                    book['language'] = ''
            else:
                book['language'] = iso639.to_iso639_2(r_book['languages'][0])

            book['cover'] = bool(r_book['cover'])

            book['last_modified'] = r_book['last_modified']
            book['timestamp'] = r_book['timestamp']

            book['formats'] = []
            formats = r_book['formats']
            for f in formats:
                # TODO: query the size online when the function to rebuild the full url is ready
                size = None
                if 'size' in r_book['format_metadata'][f]:
                    size = int(r_book['format_metadata'][f]['size'])
                book[f] = size
                book['formats'].append(f)

            books.append(book)
            rank += 1

        print('\r {:180.180}'.format(f'Saving metadata from {server}'), end='')

        try:
            save_books_metadata_from_site(db, books)
            print('\r {:180.180}'.format(f'--> Saved {rank-1}/{total_num} ebooks from {server}'), end='')
        except BaseException as err:
            print(err)

        print()
        print()

        offset = offset + num
def query(query_str="", dir="."):
    # Exploratory helper: dump the ebooks matching a SQL where clause from every site db.
    for path in os.listdir(dir):
        db = Database(Path(dir) / path)
        site_row = list(db['site'].rows)[0]
        print(site_row)

        for ebook in db["ebooks"].rows_where(query_str):
            print(ebook)


def get_stats(dir="."):
    dbs = []
    size = 0
    count = 0
    for f in os.listdir(dir):
        if not f.endswith(".db"):
            continue
        if f == "index.db":
            continue
        path = Path(dir) / f
        dbs.append(Database(path))

    for db in dbs:
        for i, ebook in enumerate(db["ebooks"].rows):
            uuid = ebook['uuid']
            formats = json.loads(ebook['formats'])
            for f in formats:
                if f in ebook:
                    if ebook[f]:
                        size += ebook[f]
                    count += 1
                    print(f'\r{count} formats - ebook : {uuid}', end='')

    print()
    print("Total count of formats:", humanize.intcomma(count))
    print("Total size:", hsize(size))

    print()


if __name__ == "__main__":
    fire.Fire()
@ -0,0 +1,64 @@
from pathlib import Path
from sqlite_utils import Database
from sqlite_utils.db import NotFoundError
import json


def init_diff_db(dir="."):

    path = Path(dir) / "diff.db"

    db_diff = Database(path)
    if not "summary" in db_diff.table_names():
        db_diff["summary"].create({
            "uuid": str,
            "title": str,
            # "cover": str,
            # "source": str,
            "authors": str,
            "year": str,
            "series": str,
            "language": str,
            "links": str,
            # "desc": str,
            "publisher": str,
            "tags": str,
            "identifiers": str,
            "formats": str,
            "status": str,
            "old_location": str
        }, pk="uuid")

    return db_diff


def diff(old, new, dir="."):
    # Compare two index snapshots: flag books that moved to a new URL and books that are new.
    path = Path(dir) / old
    db_old = Database(path)

    path = Path(dir) / new
    db_new = Database(path)

    db_diff = init_diff_db(dir)

    for i, n_book in enumerate(db_new["summary"].rows):
        n_uuid = n_book['uuid']
        print(i, n_uuid)
        try:
            o_book = db_old["summary"].get(n_uuid)
            o_loc = json.loads(o_book['title'])['href']
            n_loc = json.loads(n_book['title'])['href']
            if o_loc != n_loc:
                print(n_uuid, 'MOVED')
                n_book["status"] = "MOVED"
                n_book["old_location"] = o_loc
                n_book.pop('cover', None)
                db_diff["summary"].insert(n_book, pk='uuid')

        except NotFoundError:
            # Not present in the old snapshot: it's a new ebook.
            n_book.pop('cover', None)
            n_book["status"] = "NEW"
            db_diff["summary"].insert(n_book, pk='uuid')
@ -0,0 +1,237 @@
import os
import sys
import json
from pathlib import Path
from sqlite_utils import Database
from humanize import naturalsize as hsize

from calistat import get_desc_url, get_format_url


def init_index_db(dir="."):

    path = Path(dir) / "index.db"

    db_index = Database(path)
    if not "summary" in db_index.table_names():
        db_index["summary"].create({
            "uuid": str,
            "cover": str,
            "title": str,
            # "source": str,
            "authors": str,
            "year": str,
            "series": str,
            "language": str,
            "links": str,
            # "desc": str,
            "publisher": str,
            "tags": str,
            "identifiers": str,
            "formats": str
        }, pk="uuid")

        db_index["summary"].enable_fts(["title", "authors", "series", "language", "identifiers", "tags", "publisher", "formats", "year"])

    return db_index
def get_img_url(db, book):
    # Build the cover thumbnail URL; the endpoint differs between Calibre major versions.
    url = json.loads(list(db['site'].rows)[0]["urls"])[0]

    library = book['library']
    id_ = str(book['id'])

    major = list(db['site'].rows)[0]["major"]

    if major >= 3:
        d_url = url + "/get/thumb/" + id_ + "/" + library + "?sz=600x800"
    else:
        d_url = url + "/get/thumb_90_120/" + id_

    return d_url
def build_index(dir='.', english=True):

    dbs = []
    for f in os.listdir(dir):
        if not f.endswith(".db"):
            continue
        if f in ("index.db", "sites.db"):
            continue
        p = Path(dir) / f
        print(f)
        try:
            db = Database(p.resolve())
        except Exception:
            print("Pb with:", f)
            continue
        dbs.append(db)

    db_index = init_index_db(dir=dir)
    index_t = db_index["summary"]

    batch_size = 10000
    count = 0
    summaries = []

    for db in dbs:
        for i, ebook in enumerate(db["ebooks"].rows):
            # Keep only English ebooks (or only non-English ones with --noenglish).
            if english and (not ebook['language'] or ebook['language'] != "eng"):
                continue
            elif not english and ebook['language'] == "eng":
                continue

            if ebook['authors']:
                ebook['authors'] = json.loads(ebook['authors'])
            if ebook['identifiers']:
                ebook['identifiers'] = json.loads(ebook['identifiers'])
            if ebook['tags']:
                ebook['tags'] = json.loads(ebook['tags'])
            ebook['formats'] = json.loads(ebook['formats'])
            ebook['links'] = ""
            summary = {k: v for k, v in ebook.items() if k in ("uuid", "title", "authors", "series", "language", "formats", "tags", "publisher", "identifiers")}
            # datasette-json-html renders these dicts as a link and a thumbnail.
            summary['title'] = {'href': get_desc_url(db, ebook), 'label': ebook['title']}
            summary["cover"] = {"img_src": get_img_url(db, ebook), "width": 90}

            formats = []
            for f in ebook['formats']:
                formats.append({'href': get_format_url(db, ebook, f), 'label': f"{f} ({hsize(ebook[f])})"})
            summary['links'] = formats

            pubdate = ebook['pubdate']
            summary['year'] = pubdate[0:4] if pubdate else ""
            summaries.append(summary)
            count += 1
            print(f"\r{count} - ebook handled: {ebook['uuid']}", end='')

            if not count % batch_size:
                try:
                    index_t.insert_all(summaries, batch_size=batch_size)
                except Exception as e:
                    print()
                    print("UUID collisions. Probably a site duplicate")
                    print(e)
                    print()
                # TODO: some ebooks could be missed here; compute the batch list, insert
                # the new ebooks and update the site index instead of dropping the batch.
                summaries = []

    # Flush the last partial batch.
    try:
        index_t.insert_all(summaries, batch_size=batch_size)
    except Exception:
        print("sqlite3.IntegrityError: UNIQUE constraint failed: summary.uuid")

    print()
    print("fts")
    index_t.populate_fts(["title", "authors", "series", "identifiers", "language", "tags", "publisher", "formats", "year"])
    print("fts done")
def search(query_str, dir=".", links_only=False):
    path = Path(dir) / "index.db"
    db_index = Database(path)

    # Collect the matching ebooks and the site dbs they belong to.
    sites = set()
    ebook_ids = []
    for ebook in db_index["summary"].search(query_str):
        sites.add(ebook[-1])
        ebook_ids.append((ebook[3], ebook[-1]))

    site_dbs = {}
    for s in sites:
        f_uuid = s + ".db"
        path = Path(dir) / f_uuid
        site_dbs[s] = Database(path)

    for e in ebook_ids:
        db = site_dbs[e[1]]
        ebook = db.conn.execute(f'select * from ebooks where uuid="{e[0]}"').fetchone()
        url = json.loads(db['site'].get(1)['urls'])[0]
        library = db['site'].get(1)['library']
        formats = json.loads(ebook[14])
        id_ = str(ebook[0])

        if not links_only:
            print()
            print("Title:", ebook[2])
            print("Author:", ebook[3])
            print("Series:", ebook[4])
            print("Formats:", formats)

        for f in formats:
            print(url + "/get/" + f + "/" + id_ + "/" + library)


# https://stackoverflow.com/questions/26692284/how-to-prevent-brokenpipeerror-when-doing-a-flush-in-python

def index_to_json(dir='.'):
    # Stream the whole summary table to stdout, one JSON object per row.
    path = Path(dir) / "index.db"
    db = Database(path)

    try:
        for row in db["summary"].rows:
            if row['title']:
                row['title'] = json.loads(row['title'])
            if row['authors']:
                row['authors'] = json.loads(row['authors'])
            if row['series']:
                row['series'] = json.loads(row['series'])
            if row['links']:
                row['links'] = json.loads(row['links'])
            if row['tags']:
                row['tags'] = json.loads(row['tags'])
            if row['identifiers']:
                row['identifiers'] = json.loads(row['identifiers'])
            if row['formats']:
                row['formats'] = json.loads(row['formats'])

            json.dump(row, sys.stdout)
            sys.stdout.flush()
    except BrokenPipeError:
        # See the Stack Overflow link above: point stdout at devnull so Python
        # doesn't raise again while flushing at interpreter exit.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
@ -0,0 +1,206 @@
import requests
from pathlib import Path
from urllib.parse import urlparse, urlunparse
import uuid
from sqlite_utils import Database
import datetime

import gevent
from gevent import monkey
from gevent import Timeout
from gevent.pool import Pool
monkey.patch_socket()


def init_sites_db(dir="."):

    path = Path(dir) / "sites.db"

    db = Database(path)
    if not "sites" in db.table_names():
        db["sites"].create({
            "uuid": str,
            "url": str,
            "hostnames": str,
            "ports": str,
            "country": int,
            "isp": str,
            "status": str,
            "last_online": str,
            "last_check": str,
            "error": int,
            # "schema_version": 1,
            # TODO: add the most common formats
        }, pk="uuid")

    db.table("sites", pk='uuid', batch_size=100, alter=True)
    return db
def save_site(db: Database, site):
    # TODO: check whether the site is not already present
    if not 'uuid' in site:
        site['uuid'] = str(uuid.uuid4())
    print(site)
    db["sites"].upsert(site, pk='uuid')


def check_and_save_site(db, site):
    res = check_calibre_site(site)
    print(res)
    save_site(db, res)


def check_calibre_site(site):
    # Probe one site and return its refreshed status fields (online/unauthorized/down).
    ret = {}
    ret['uuid'] = site["uuid"]
    now = str(datetime.datetime.now())
    ret['last_check'] = now

    api = site['url'] + '/ajax/'
    timeout = 15
    library = ""
    url = api + 'search' + library + '?num=0'
    print()
    print("Getting ebooks count:", site['url'])
    print(url)

    try:
        r = requests.get(url, verify=False, timeout=(timeout, 30))
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        ret['error'] = r.status_code
        if r.status_code == 401:
            ret['status'] = "unauthorized"
        else:
            ret['status'] = "down"
        return ret
    except requests.RequestException as e:
        print("Unable to open site:", url)
        print(e)
        ret['status'] = "down"
        return ret
    except Exception as e:
        print("Other issue:", e)
        ret['status'] = 'Unknown Error'
        return ret

    try:
        print("Total count=", r.json()["total_num"])
    except Exception:
        pass

    ret['status'] = 'online'
    ret['last_online'] = now

    return ret
def get_site_uuid_from_url(db, url):
    # Look the site up by hostname; return the row if the host is already known.
    site = urlparse(url)
    hostname = site.hostname
    site = site._replace(path='')

    url = urlunparse(site)

    row = db.conn.execute(f"select * from sites where instr(hostnames, '{hostname}')").fetchone()
    if row:
        return row


def map_site_from_url(url):
    # Normalize a URL into a site record: url without path, plus hostname and port lists.
    ret = {}

    site = urlparse(url)

    print(site)
    site = site._replace(path='')
    ret['url'] = urlunparse(site)
    ret['hostnames'] = [site.hostname]
    ret['ports'] = [str(site.port)]

    return ret
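
# For example (hypothetical URL):
#   map_site_from_url("http://example.com:8080/some/path")
#   -> {'url': 'http://example.com:8080', 'hostnames': ['example.com'], 'ports': ['8080']}
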
def import_urls_from_file(filepath, dir='.'):

    # TODO: skip malformed urls
    # TODO: use cache instead

    db = init_sites_db(dir)

    with open(filepath) as f:
        for url in f.readlines():
            url = url.rstrip()
            if get_site_uuid_from_url(db, url):
                print(f"'{url}' already present")
                continue
            print(f"'{url}' added")
            save_site(db, map_site_from_url(url))
def get_libs_from_site(site):
    # Query /ajax/library-info to list the libraries hosted on a site (Calibre >= 3 only).
    server = site.rstrip('/')
    api = server + '/ajax/'
    timeout = 30

    print()
    print("Server:", server)
    url = api + 'library-info'

    print()
    print("Getting libraries from", server)

    try:
        r = requests.get(url, verify=False, timeout=(timeout, 30))
        r.raise_for_status()
    except requests.RequestException:
        print("Unable to open site:", url)
        return
    except Exception as e:
        print("Other issue:", e)
        return

    libraries = r.json()["library_map"].keys()
    print("Libraries:", ", ".join(libraries))
    return libraries


def check_calibre_list(dir='.'):
    # Check all known sites concurrently and persist their refreshed status.
    db = init_sites_db(dir)
    sites = []
    for row in db["sites"].rows:
        print(f"Queueing: {row['url']}")
        sites.append(row)
    print(sites)
    pool = Pool(100)
    pool.map(lambda s: check_and_save_site(db, s), sites)


# Example of an fts search:
#   sqlite-utils index.db "select * from summary_fts where summary_fts match 'title:fre*'"
File diff suppressed because it is too large
@ -0,0 +1,29 @@
[tool.poetry]
name = "calishot"
version = "0.1.0"
description = ""
authors = ["Your Name <you@example.com>"]

[tool.poetry.dependencies]
python = "^3.8"
sqlite-utils = "^2.8"
bs4 = "^0.0.1"
gevent = "^20.5.0"
datasette-pretty-json = "^0.2"
datasette-json-html = "^0.6"
datasette-mask-columns = "^0.2"
requests = "^2.24.0"
humanize = "^2.5.0"
langid = "^1.1.6"
iso639 = "^0.1.4"
unidecode = "^1.1.1"
datasette = "^0.50.2"
sqlitedict = "^1.7.0"
fire = "^0.3.1"

[tool.poetry.dev-dependencies]
pytest = "^5.2"

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"