Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sources.browser: switch to use HPI + seanbreckenridge/browserexport #375

Merged
merged 4 commits into from
Feb 10, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
source.browser: implement fallbacks onto old browser module
- if my.browser.export is available try to hack HPI config and use it
- if not, fallback to promnesia.sources.browser_old
karlicoss committed Feb 10, 2023
commit 57cc7790e0c2f8b05025ab2a4ad93acdb556e0ea
8 changes: 8 additions & 0 deletions src/promnesia/common.py
Original file line number Diff line number Diff line change
@@ -586,3 +586,11 @@ def measure(tag: str='', *, logger, unit: str='ms'):
mult = {'s': 1, 'ms': 10**3, 'us': 10**6}[unit]
xx = secs * mult
logger.debug(f'[{tag}]: {xx:.1f}{unit} elapsed')


def is_sqlite_db(x: Path) -> bool:
    """Return True when ``x`` is a regular file whose ``mime(x)`` is a known SQLite mime type."""
    # is_file() is checked first, so mime() is never computed for non-files
    if not x.is_file():
        return False
    sqlite_mimes = {
        'application/x-sqlite3',
        'application/vnd.sqlite3',
        # TODO this mime can also match wal files/journals, not sure
    }
    return mime(x) in sqlite_mimes
84 changes: 84 additions & 0 deletions src/promnesia/sources/browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import re
from typing import Optional, Iterator, Any, TYPE_CHECKING
import warnings

from promnesia.common import Results, Visit, Loc, Second, PathIsh, logger, is_sqlite_db


def index(p: Optional[PathIsh]=None) -> Results:
    """
    Index browser history and yield the resulting visits.

    When ``p`` is None, history is taken from ``my.browser.all.history`` (HPI).

    Passing ``p`` is deprecated and emits a warning. In that case the path is first
    indexed through ``my.browser.export`` with a temporary ad-hoc HPI config; if that
    raises, the error is logged and the legacy ``promnesia.sources.browser_old``
    indexer is used as a fallback.
    """
    # NOTE(review): 'hpi' is not referenced below; presumably imported for its side effects -- confirm
    from . import hpi

    if p is None:
        from my.browser.all import history
        yield from _index_new(history())
        return

    warnings.warn('Passing paths to promnesia.sources.browser is deprecated. You should switch to HPI for that. See https://github.com/seanbreckenridge/browserexport#hpi')

    # even if the user doesn't have HPI config for my.browser set up,
    # try to index the given path via an ad-hoc config first
    try:
        yield from _index_new_with_adhoc_config(path=p)
    except Exception as e:
        # NOTE: if the failure happens after some visits were already yielded,
        # the fallback below still runs over the whole path
        logger.exception(e)
        warnings.warn("Setting my.config.browser.export didn't work. You probably need to update HPI.")
    else:
        return

    logger.warning("Falling back onto legacy promnesia.sources.browser_old")
    # previously a stray 'raise RuntimeError' here made the fallback unreachable
    yield from _index_old(path=p)


def _index_old(*, path: PathIsh) -> Results:
    """Delegate indexing of ``path`` to the legacy ``browser_old`` module."""
    # imported lazily, so the legacy module is only loaded when the fallback is actually used
    from . import browser_old as legacy_browser
    yield from legacy_browser.index(path)


def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
    """
    Index ``path`` through ``my.browser.export`` under a temporary HPI config.

    The temporary config sets ``browser.export.export_path`` to the files returned by
    ``get_files(path, glob='**/*')`` that pass ``is_sqlite_db``, and overrides
    ``core.cache_dir`` with a path-specific subdirectory (when HPI has a cache dir).
    """
    from my.core.core_config import config as hpi_core_config
    from my.core.common import classproperty, Paths, get_files
    from my.core.cfg import tmp_config

    ## index used to be callable with several different db search paths;
    ## each subsequent call to my.browser.export.history would then invalidate the cache,
    ## so the cachew path is hacked to be different for every search path
    base_cache_dir = hpi_core_config.get_cache_dir()
    if base_cache_dir is None:
        cache_override = None
    else:
        # replace every non-word character so the search path becomes a single directory name
        cache_override = base_cache_dir / re.sub(r'\W', '_', str(path))
    ##

    class config:
        class core:
            cache_dir = cache_override

        class browser:
            class export:
                @classproperty
                def export_path(cls) -> Paths:
                    candidates = get_files(path, glob='**/*')
                    return tuple(filter(is_sqlite_db, candidates))

    with tmp_config(modules='my.browser.export|my.core.core_config', config=config):
        # imported inside the context so the module is loaded while the temporary config is active
        from my.browser.export import history
        yield from _index_new(history())


if TYPE_CHECKING:
from browserexport.merge import Visit as BrowserMergeVisit
else:
BrowserMergeVisit = Any


def _index_new(history: Iterator[BrowserMergeVisit]) -> Results:
    """
    Turn each item of ``history`` into a promnesia ``Visit``.

    Title and duration are taken from the item's ``metadata`` when it is present;
    the locator title falls back to the url when there is no (non-empty) title.
    """
    for entry in history:
        meta = entry.metadata
        title: Optional[str] = None if meta is None else meta.title
        duration: Optional[Second] = None if meta is None else meta.duration
        link = entry.url
        yield Visit(
            url=link,
            dt=entry.dt,
            locator=Loc(title=title or link, href=link),
            duration=duration,
        )
22 changes: 0 additions & 22 deletions src/promnesia/sources/browser_new.py

This file was deleted.

14 changes: 2 additions & 12 deletions src/promnesia/sources/browser_old.py
Original file line number Diff line number Diff line change
@@ -6,31 +6,21 @@

import pytz

from ..common import PathIsh, Results, Visit, Loc, get_logger, Second, mime
from ..common import PathIsh, Results, Visit, Loc, logger, Second, is_sqlite_db
from .. import config

# todo mcachew?
from cachew import cachew

logger = get_logger()


def index(p: PathIsh) -> Results:
pp = Path(p)
assert pp.exists(), pp # just in case of broken symlinks

# is_file check because it also returns dirs
# TODO hmm, not sure what I meant here -- which dirs? behind symlinks?
is_db = lambda x: x.is_file() and mime(x) in {
'application/x-sqlite3',
'application/vnd.sqlite3',
# TODO this mime can also match wal files/journals, not sure
}

# todo warn if filtered out too many?
# todo wonder how quickly mimes can be computed?
# todo ugh, dunno, maybe this really belongs to hpi?? need get_files etc...
dbs = [p for p in sorted(pp.rglob('*')) if is_db(p)]
dbs = [p for p in sorted(pp.rglob('*')) if is_sqlite_db(p)]

assert len(dbs) > 0, pp
logger.info('processing %d databases', len(dbs))
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
@@ -72,6 +72,7 @@ commands =
hpi module install my.reddit
hpi module install my.fbmessenger
hpi module install my.google.takeout.parser
hpi module install my.browser.export

{envpython} -m mypy --install-types --non-interactive \
-p promnesia.sources \