Added --duplicate flag to find possible duplicates

This commit is contained in:
Rhet Turnbull 2021-06-12 18:31:53 -07:00
parent 6a0b8b4a3f
commit 83892e096a
37 changed files with 192 additions and 50 deletions

View File

@ -3449,6 +3449,7 @@ For additional details about how osxphotos is implemented or if you would like t
- [Rich](https://github.com/willmcgugan/rich)
- [textx](https://github.com/textX/textX)
- [bitmath](https://github.com/tbielawa/bitmath)
- [more-itertools](https://github.com/more-itertools/more-itertools)
## Acknowledgements

View File

@ -9,12 +9,10 @@ import pathlib
import pprint
import sys
import time
import unicodedata
import bitmath
import click
import osxmetadata
import photoscript
import yaml
import osxphotos
@ -458,6 +456,14 @@ def QUERY_OPTIONS(f):
is_flag=True,
help="Search for photos that are not in any albums.",
),
o(
"--duplicate",
is_flag=True,
help="Search for photos with possible duplicates. osxphotos will compare signatures of photos, "
"evaluating date created, size, height, width, and edited status to find *possible* duplicates. "
"This does not compare images byte-for-byte nor compare hashes but should find photos imported multiple "
"times or duplicated within Photos."
),
o(
"--min-size",
metavar="SIZE",
@ -1067,6 +1073,7 @@ def export(
max_size,
regex,
query_eval,
duplicate,
):
"""Export photos from the Photos database.
Export path DEST is required.
@ -1221,6 +1228,7 @@ def export(
max_size = cfg.max_size
regex = cfg.regex
query_eval = cfg.query_eval
duplicate = cfg.duplicate
# config file might have changed verbose
VERBOSE = bool(verbose)
@ -1526,6 +1534,7 @@ def export(
max_size=max_size,
regex=regex,
query_eval=query_eval,
duplicate=duplicate,
)
try:
@ -1891,6 +1900,7 @@ def query(
is_reference,
in_album,
not_in_album,
duplicate,
min_size,
max_size,
regex,
@ -1926,6 +1936,7 @@ def query(
min_size,
max_size,
regex,
duplicate,
]
exclusive = [
(favorite, not_favorite),
@ -2051,6 +2062,7 @@ def query(
max_size=max_size,
query_eval=query_eval,
regex=regex,
duplicate=duplicate
)
try:

View File

@ -1,7 +1,10 @@
""" PhotosAlbum class to create an album in default Photos library and add photos to it """
from typing import Optional, List
from typing import List, Optional
import photoscript
from more_itertools import chunked
from .photoinfo import PhotoInfo
from .utils import noop
@ -27,7 +30,8 @@ class PhotosAlbum:
def add_list(self, photo_list: List[PhotoInfo]):
    """Add every photo in photo_list to this album and report the count.

    Args:
        photo_list: PhotoInfo objects; each one's uuid is used to build the
            photoscript.Photo that is added to the album.
    """
    photos = [photoscript.Photo(p.uuid) for p in photo_list]
    # Add in batches of 10 instead of one call with the whole list. The list
    # must be added exactly once, so there is no whole-list add before the loop.
    # NOTE(review): presumably batching keeps each call to Photos small -- confirm.
    for batch in chunked(photos, 10):
        self.album.add(batch)
    photo_len = len(photos)
    photo_word = "photos" if photo_len > 1 else "photo"
    self.verbose(f"Added {photo_len} {photo_word} to album {self.name}")

View File

@ -11,6 +11,7 @@ import platform
import re
import sys
import tempfile
from collections import OrderedDict
from datetime import datetime, timedelta, timezone
from pprint import pformat
from typing import List
@ -65,17 +66,17 @@ class PhotosDB:
"""Processes a Photos.app library database to extract information about photos"""
# import additional methods
from ._photosdb_process_comments import _process_comments
from ._photosdb_process_exif import _process_exifinfo
from ._photosdb_process_faceinfo import _process_faceinfo
from ._photosdb_process_scoreinfo import _process_scoreinfo
from ._photosdb_process_searchinfo import (
_process_searchinfo,
labels,
labels_normalized,
labels_as_dict,
labels_normalized,
labels_normalized_as_dict,
)
from ._photosdb_process_scoreinfo import _process_scoreinfo
from ._photosdb_process_comments import _process_comments
def __init__(self, dbfile=None, verbose=None, exiftool=None):
"""Create a new PhotosDB object.
@ -3225,6 +3226,32 @@ class PhotosDB:
except Exception as e:
raise ValueError(f"Invalid query_eval CRITERIA: {e}")
if options.duplicate:
no_date = datetime(1970, 1, 1)
tz = timezone(timedelta(0))
no_date = no_date.astimezone(tz=tz)
photos = sorted(
[p for p in photos if p.duplicates],
key=lambda x: x.date_added or no_date,
)
# gather all duplicates but ensure each uuid is only represented once
photodict = OrderedDict()
for p in photos:
if p.uuid not in photodict:
photodict[p.uuid] = p
for d in sorted(
p.duplicates, key=lambda x: x.date_added or no_date
):
if d.uuid not in photodict:
photodict[d.uuid] = d
photos = list(photodict.values())
# filter for deleted as photo.duplicates will include photos in the trash
if not (options.deleted or options.deleted_only):
photos = [p for p in photos if not p.intrash]
if options.deleted_only:
photos = [p for p in photos if p.intrash]
return photos
def _duplicate_signature(self, uuid):

View File

@ -1,6 +1,6 @@
""" QueryOptions class for PhotosDB.query """
from dataclasses import dataclass
from dataclasses import dataclass, asdict
from typing import Optional, Iterable, Tuple
import datetime
import bitmath
@ -30,7 +30,7 @@ class QueryOptions:
shared: Optional[bool] = None
not_shared: Optional[bool] = None
photos: Optional[bool] = True
movies: Optional[bool] = True
movies: Optional[bool] = True
uti: Optional[Iterable[str]] = None
burst: Optional[bool] = None
not_burst: Optional[bool] = None
@ -78,6 +78,7 @@ class QueryOptions:
max_size: Optional[bitmath.Byte] = None
regex: Optional[Iterable[Tuple[str, str]]] = None
query_eval: Optional[Iterable[str]] = None
duplicate: Optional[bool] = None
def asdict(self):
    """Return this object's dataclass fields as a dict, via dataclasses.asdict"""
    # inside the method body, `asdict` is the function imported from dataclasses
    options_dict = asdict(self)
    return options_dict

View File

@ -19,3 +19,4 @@ osxmetadata==0.99.14
textx==2.3.0
rich==10.2.2
bitmath==1.3.3.1
more-itertools==8.8.0

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 144 KiB

View File

@ -23,8 +23,8 @@ PHOTOS_DB = "tests/Test-10.15.7.photoslibrary/database/photos.db"
PHOTOS_DB_PATH = "/Test-10.15.7.photoslibrary/database/photos.db"
PHOTOS_LIBRARY_PATH = "/Test-10.15.7.photoslibrary"
PHOTOS_DB_LEN = 20
PHOTOS_NOT_IN_TRASH_LEN = 18
PHOTOS_DB_LEN = 21
PHOTOS_NOT_IN_TRASH_LEN = 19
PHOTOS_IN_TRASH_LEN = 2
PHOTOS_DB_IMPORT_SESSIONS = 15
@ -72,10 +72,10 @@ KEYWORDS_DICT = {
"foo/bar": 1,
"Travel": 2,
"Maria": 1,
"Drink": 1,
"Val d'Isère": 1,
"Wine": 1,
"Wine Bottle": 1,
"Drink": 2,
"Val d'Isère": 2,
"Wine": 2,
"Wine Bottle": 2,
}
PERSONS_DICT = {"Katie": 3, "Suzy": 2, "Maria": 2, _UNKNOWN_PERSON: 1}
ALBUM_DICT = {
@ -1063,7 +1063,7 @@ def test_from_to_date(photosdb):
time.tzset()
photos = photosdb.photos(from_date=datetime.datetime(2018, 10, 28))
assert len(photos) == 11
assert len(photos) == 12
photos = photosdb.photos(to_date=datetime.datetime(2018, 10, 28))
assert len(photos) == 7

View File

@ -90,6 +90,34 @@ CLI_EXPORT_FILENAMES = [
"Tulips_edited.jpeg",
"screenshot-really-a-png.jpeg",
"winebottle.jpeg",
"winebottle (1).jpeg",
]
# File names the dry-run export test expects to see reported as "Exported"
# in the CLI output; that test also asserts none of them exist on disk.
# NOTE(review): "winebottle.jpeg" is listed twice here, whereas
# CLI_EXPORT_FILENAMES has "winebottle (1).jpeg" -- presumably because a dry
# run writes nothing, so no name collision occurs; confirm.
CLI_EXPORT_FILENAMES_DRY_RUN = [
"Pumkins1.jpg",
"Pumkins2.jpg",
"Pumpkins3.jpg",
"St James Park.jpg",
"St James Park_edited.jpeg",
"Tulips.jpg",
"wedding.jpg",
"wedding_edited.jpeg",
"DSC03584.dng",
"IMG_1693.tif",
"IMG_1994.JPG",
"IMG_1994.cr2",
"IMG_1997.JPG",
"IMG_1997.cr2",
"IMG_3092.heic",
"IMG_3092_edited.jpeg",
"IMG_4547.jpg",
"Jellyfish.MOV",
"Jellyfish1.mp4",
"Tulips_edited.jpeg",
"screenshot-really-a-png.jpeg",
"winebottle.jpeg",
"winebottle.jpeg",
]
CLI_EXPORT_IGNORE_SIGNATURE_FILENAMES = ["Tulips.jpg", "wedding.jpg"]
@ -128,6 +156,7 @@ CLI_EXPORT_FILENAMES_EDITED_SUFFIX = [
"Tulips_bearbeiten.jpeg",
"screenshot-really-a-png.jpeg",
"winebottle.jpeg",
"winebottle (1).jpeg",
]
CLI_EXPORT_FILENAMES_EDITED_SUFFIX_TEMPLATE = [
@ -153,6 +182,7 @@ CLI_EXPORT_FILENAMES_EDITED_SUFFIX_TEMPLATE = [
"Tulips_edited.jpeg",
"screenshot-really-a-png.jpeg",
"winebottle.jpeg",
"winebottle (1).jpeg",
]
CLI_EXPORT_FILENAMES_ORIGINAL_SUFFIX = [
@ -178,6 +208,7 @@ CLI_EXPORT_FILENAMES_ORIGINAL_SUFFIX = [
"Tulips_edited.jpeg",
"screenshot-really-a-png_original.jpeg",
"winebottle_original.jpeg",
"winebottle_original (1).jpeg",
]
CLI_EXPORT_FILENAMES_ORIGINAL_SUFFIX_TEMPLATE = [
@ -203,6 +234,7 @@ CLI_EXPORT_FILENAMES_ORIGINAL_SUFFIX_TEMPLATE = [
"Jellyfish1.mp4",
"screenshot-really-a-png.jpeg",
"winebottle.jpeg",
"winebottle (1).jpeg",
]
CLI_EXPORT_FILENAMES_CURRENT = [
@ -228,6 +260,7 @@ CLI_EXPORT_FILENAMES_CURRENT = [
"8846E3E6-8AC8-4857-8448-E3D025784410.tiff",
"D1359D09-1373-4F3B-B0E3-1A4DE573E4A3.mp4",
"E2078879-A29C-4D6F-BACB-E3BBE6C3EB91.jpeg",
"52083079-73D5-4921-AC1B-FE76F279133F.jpeg",
]
CLI_EXPORT_FILENAMES_CONVERT_TO_JPEG = [
@ -253,6 +286,7 @@ CLI_EXPORT_FILENAMES_CONVERT_TO_JPEG = [
"Jellyfish1.mp4",
"screenshot-really-a-png.jpeg",
"winebottle.jpeg",
"winebottle (1).jpeg",
]
CLI_EXPORT_FILENAMES_CONVERT_TO_JPEG_SKIP_RAW = [
@ -276,6 +310,7 @@ CLI_EXPORT_FILENAMES_CONVERT_TO_JPEG_SKIP_RAW = [
"Jellyfish1.mp4",
"screenshot-really-a-png.jpeg",
"winebottle.jpeg",
"winebottle (1).jpeg",
]
CLI_EXPORT_CONVERT_TO_JPEG_LARGE_FILE = "DSC03584.jpeg"
@ -445,9 +480,10 @@ PHOTOS_NOT_IN_TRASH_LEN_14_6 = 12
PHOTOS_IN_TRASH_LEN_14_6 = 1
PHOTOS_MISSING_14_6 = 1
PHOTOS_NOT_IN_TRASH_LEN_15_7 = 18
PHOTOS_NOT_IN_TRASH_LEN_15_7 = 19
PHOTOS_IN_TRASH_LEN_15_7 = 2
PHOTOS_MISSING_15_7 = 2
PHOTOS_EDITED_15_7 = 4
CLI_PLACES_JSON = """{"places": {"_UNKNOWN_": 1, "Maui, Wailea, Hawai'i, United States": 1, "Washington, District of Columbia, United States": 1}}"""
@ -608,14 +644,27 @@ KEYWORDS_JSON = {
"flowers": 1,
"foo/bar": 1,
"Maria": 1,
"Wine": 1,
"Val d'Isère": 1,
"Drink": 1,
"Wine Bottle": 1,
"Wine": 2,
"Val d'Isère": 2,
"Drink": 2,
"Wine Bottle": 2,
}
}
ALBUMS_JSON = {"albums": {"Raw": 4, "Pumpkin Farm": 3, "Test Album": 2, "AlbumInFolder": 2, "Multi Keyword": 2, "I have a deleted twin": 1, "2018-10 - Sponsion, Museum, Frühstück, Römermuseum": 1, "2019-10/11 Paris Clermont": 1, "EmptyAlbum": 0}, "shared albums": {}}
ALBUMS_JSON = {
"albums": {
"Raw": 4,
"Pumpkin Farm": 3,
"Test Album": 2,
"AlbumInFolder": 2,
"Multi Keyword": 2,
"I have a deleted twin": 1,
"2018-10 - Sponsion, Museum, Frühstück, Römermuseum": 1,
"2019-10/11 Paris Clermont": 1,
"EmptyAlbum": 0,
},
"shared albums": {},
}
ALBUMS_STR = """albums:
Raw: 4
@ -711,6 +760,12 @@ UUID_NOT_IN_ALBUM = [
"35329C57-B963-48D6-BB75-6AFF9370CBBC",
"8846E3E6-8AC8-4857-8448-E3D025784410",
"7F74DD34-5920-4DA3-B284-479887A34F66",
"52083079-73D5-4921-AC1B-FE76F279133F",
]
# UUIDs the --duplicate tests compare against: test_query_duplicate expects
# exactly these from the query, and test_export_duplicate expects as many
# exported files as there are entries.
UUID_DUPLICATES = [
"7F74DD34-5920-4DA3-B284-479887A34F66",
"52083079-73D5-4921-AC1B-FE76F279133F",
]
@ -1057,6 +1112,27 @@ def test_query_not_in_album():
assert sorted(uuid_got) == sorted(UUID_NOT_IN_ALBUM)
def test_query_duplicate():
    """Query with --duplicate returns exactly the photos in UUID_DUPLICATES"""
    import json
    import os
    import os.path

    from osxphotos.cli import query

    db_path = os.path.join(os.getcwd(), CLI_PHOTOS_DB)
    cli_args = ["--json", "--db", db_path, "--duplicate"]
    result = CliRunner().invoke(query, cli_args)
    assert result.exit_code == 0

    # collect the uuid of every record in the JSON output; order is not significant
    uuid_got = [record["uuid"] for record in json.loads(result.output)]
    assert sorted(uuid_got) == sorted(UUID_DUPLICATES)
def test_export():
import glob
import os
@ -1848,6 +1924,26 @@ def test_export_convert_to_jpeg_skip_raw():
assert sorted(files) == sorted(CLI_EXPORT_FILENAMES_CONVERT_TO_JPEG_SKIP_RAW)
def test_export_duplicate():
    """Export with --duplicate writes as many files as UUID_DUPLICATES has entries"""
    import glob
    import os

    from osxphotos.cli import export

    runner = CliRunner()
    # capture the working directory before entering the isolated filesystem so
    # the path to the test library is absolute
    cwd = os.getcwd()
    # pylint: disable=not-context-manager
    with runner.isolated_filesystem():
        result = runner.invoke(
            export, [os.path.join(cwd, CLI_PHOTOS_DB), ".", "-V", "--duplicate"]
        )
        assert result.exit_code == 0
        # the export destination is ".", so glob must run while still inside
        # the isolated directory
        files = glob.glob("*")
        assert len(files) == len(UUID_DUPLICATES)
def test_query_date_1():
"""Test --from-date and --to-date"""
import json
@ -3842,7 +3938,7 @@ def test_export_update_basic():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 0, updated: 0, skipped: 22, updated EXIF data: 0, missing: 2, error: 0"
"Processed: 19 photos, exported: 0, updated: 0, skipped: 23, updated EXIF data: 0, missing: 2, error: 0"
in result.output
)
@ -3926,7 +4022,7 @@ def test_export_update_exiftool():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 0, updated: 22, skipped: 0, updated EXIF data: 22, missing: 2, error: 1"
"Processed: 19 photos, exported: 0, updated: 23, skipped: 0, updated EXIF data: 23, missing: 2, error: 1"
in result.output
)
@ -3936,7 +4032,7 @@ def test_export_update_exiftool():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 0, updated: 0, skipped: 22, updated EXIF data: 0, missing: 2, error: 0"
"Processed: 19 photos, exported: 0, updated: 0, skipped: 23, updated EXIF data: 0, missing: 2, error: 0"
in result.output
)
@ -3973,7 +4069,7 @@ def test_export_update_hardlink():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 0, updated: 22, skipped: 0, updated EXIF data: 0, missing: 2, error: 0"
"Processed: 19 photos, exported: 0, updated: 23, skipped: 0, updated EXIF data: 0, missing: 2, error: 0"
in result.output
)
assert not os.path.samefile(CLI_EXPORT_UUID_FILENAME, photo.path)
@ -4012,7 +4108,7 @@ def test_export_update_hardlink_exiftool():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 0, updated: 22, skipped: 0, updated EXIF data: 22, missing: 2, error: 1"
"Processed: 19 photos, exported: 0, updated: 23, skipped: 0, updated EXIF data: 23, missing: 2, error: 1"
in result.output
)
assert not os.path.samefile(CLI_EXPORT_UUID_FILENAME, photo.path)
@ -4050,7 +4146,7 @@ def test_export_update_edits():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 1, updated: 1, skipped: 20, updated EXIF data: 0, missing: 2, error: 0"
f"Processed: {PHOTOS_NOT_IN_TRASH_LEN_15_7} photos, exported: 1, updated: 1, skipped: {PHOTOS_NOT_IN_TRASH_LEN_15_7+PHOTOS_EDITED_15_7-2}, updated EXIF data: 0, missing: 2, error: 0"
in result.output
)
@ -4148,7 +4244,7 @@ def test_export_update_no_db():
# edited files will be re-exported because there won't be an edited signature
# in the database
assert (
"Processed: 18 photos, exported: 0, updated: 4, skipped: 18, updated EXIF data: 0, missing: 2, error: 0"
"Processed: 19 photos, exported: 0, updated: 4, skipped: 19, updated EXIF data: 0, missing: 2, error: 0"
in result.output
)
assert os.path.isfile(OSXPHOTOS_EXPORT_DB)
@ -4188,7 +4284,7 @@ def test_export_then_hardlink():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 22, missing: 2, error: 0" in result.output
"Processed: 19 photos, exported: 23, missing: 2, error: 0" in result.output
)
assert os.path.samefile(CLI_EXPORT_UUID_FILENAME, photo.path)
@ -4210,9 +4306,9 @@ def test_export_dry_run():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 22, missing: 2, error: 0" in result.output
"Processed: 19 photos, exported: 23, missing: 2, error: 0" in result.output
)
for filepath in CLI_EXPORT_FILENAMES:
for filepath in CLI_EXPORT_FILENAMES_DRY_RUN:
assert re.search(r"Exported.*" + f"{filepath}", result.output)
assert not os.path.isfile(filepath)
@ -4255,7 +4351,7 @@ def test_export_update_edits_dry_run():
)
assert result.exit_code == 0
assert (
"Processed: 18 photos, exported: 1, updated: 1, skipped: 20, updated EXIF data: 0, missing: 2, error: 0"
f"Processed: {PHOTOS_NOT_IN_TRASH_LEN_15_7} photos, exported: 1, updated: 1, skipped: {PHOTOS_NOT_IN_TRASH_LEN_15_7+PHOTOS_EDITED_15_7-2}, updated EXIF data: 0, missing: 2, error: 0"
in result.output
)
@ -4290,7 +4386,7 @@ def test_export_directory_template_1_dry_run():
],
)
assert result.exit_code == 0
assert "exported: 22" in result.output
assert "exported: 23" in result.output
workdir = os.getcwd()
for filepath in CLI_EXPORTED_DIRECTORY_TEMPLATE_FILENAMES1:
assert re.search(r"Exported.*" + f"{filepath}", result.output)
@ -4326,8 +4422,8 @@ def test_export_touch_files():
)
assert result.exit_code == 0
assert "exported: 22" in result.output
assert "touched date: 20" in result.output
assert "exported: 23" in result.output
assert "touched date: 21" in result.output
for fname, mtime in zip(CLI_EXPORT_BY_DATE, CLI_EXPORT_BY_DATE_TOUCH_TIMES):
st = os.stat(fname)
@ -4359,7 +4455,7 @@ def test_export_touch_files_update():
)
assert result.exit_code == 0
assert "exported: 22" in result.output
assert "exported: 23" in result.output
assert not pathlib.Path(CLI_EXPORT_BY_DATE[0]).is_file()
@ -4369,7 +4465,7 @@ def test_export_touch_files_update():
)
assert result.exit_code == 0
assert "exported: 22" in result.output
assert "exported: 23" in result.output
assert pathlib.Path(CLI_EXPORT_BY_DATE[0]).is_file()
@ -4380,7 +4476,7 @@ def test_export_touch_files_update():
)
assert result.exit_code == 0
assert "skipped: 22" in result.output
assert "skipped: 23" in result.output
# --update --touch-file --dry-run
result = runner.invoke(
@ -4395,8 +4491,8 @@ def test_export_touch_files_update():
],
)
assert result.exit_code == 0
assert "skipped: 22" in result.output
assert "touched date: 20" in result.output
assert "skipped: 23" in result.output
assert "touched date: 21" in result.output
for fname, mtime in zip(
CLI_EXPORT_BY_DATE_NEED_TOUCH, CLI_EXPORT_BY_DATE_NEED_TOUCH_TIMES
@ -4416,8 +4512,8 @@ def test_export_touch_files_update():
],
)
assert result.exit_code == 0
assert "skipped: 22" in result.output
assert "touched date: 20" in result.output
assert "skipped: 23" in result.output
assert "touched date: 21" in result.output
for fname, mtime in zip(
CLI_EXPORT_BY_DATE_NEED_TOUCH, CLI_EXPORT_BY_DATE_NEED_TOUCH_TIMES
@ -4440,7 +4536,7 @@ def test_export_touch_files_update():
],
)
assert result.exit_code == 0
assert "updated: 1, skipped: 21" in result.output
assert "updated: 1, skipped: 22" in result.output
assert "touched date: 1" in result.output
for fname, mtime in zip(CLI_EXPORT_BY_DATE, CLI_EXPORT_BY_DATE_TOUCH_TIMES):
@ -4454,7 +4550,7 @@ def test_export_touch_files_update():
)
assert result.exit_code == 0
assert "skipped: 22" in result.output
assert "skipped: 23" in result.output
@pytest.mark.skip("TODO: This fails on some machines but not all")
@ -4505,7 +4601,7 @@ def test_export_touch_files_exiftool_update():
)
assert result.exit_code == 0
assert "skipped: 18" in result.output
assert "skipped: 19" in result.output
# --update --exiftool --dry-run
result = runner.invoke(
@ -4553,7 +4649,7 @@ def test_export_touch_files_exiftool_update():
],
)
assert result.exit_code == 0
assert "skipped: 18" in result.output
assert "skipped: 19" in result.output
assert "touched date: 18" in result.output
# --update --touch-file --exiftool
@ -4569,7 +4665,7 @@ def test_export_touch_files_exiftool_update():
],
)
assert result.exit_code == 0
assert "skipped: 18" in result.output
assert "skipped: 19" in result.output
assert "touched date: 18" in result.output
for fname, mtime in zip(CLI_EXPORT_BY_DATE, CLI_EXPORT_BY_DATE_TOUCH_TIMES):
@ -4615,7 +4711,7 @@ def test_export_touch_files_exiftool_update():
)
assert result.exit_code == 0
assert "exported: 0" in result.output
assert "skipped: 18" in result.output
assert "skipped: 19" in result.output
# run update without --touch-file
result = runner.invoke(
@ -4631,7 +4727,7 @@ def test_export_touch_files_exiftool_update():
assert result.exit_code == 0
assert "exported: 0" in result.output
assert "skipped: 18" in result.output
assert "skipped: 19" in result.output
def test_export_ignore_signature():