Normalize unicode for issue #208

This commit is contained in:
Rhet Turnbull
2020-08-31 05:24:54 -07:00
parent c9b15186a0
commit a36eb416b1
19 changed files with 99 additions and 33 deletions

View File

@@ -10,6 +10,7 @@ import pathlib
import pprint
import sys
import time
import unicodedata
import click
import yaml
@@ -22,7 +23,7 @@ from pathvalidate import (
import osxphotos
from ._constants import _EXIF_TOOL_URL, _PHOTOS_4_VERSION, _UNKNOWN_PLACE
from ._constants import _EXIF_TOOL_URL, _PHOTOS_4_VERSION, _UNKNOWN_PLACE, UNICODE_FORMAT
from ._export_db import ExportDB, ExportDBInMemory
from ._version import __version__
from .datetime_formatter import DateTimeFormatter
@@ -40,9 +41,21 @@ OSXPHOTOS_EXPORT_DB = ".osxphotos_export.db"
def verbose(*args, **kwargs):
    """ Forward all arguments to click.echo, but only when VERBOSE is truthy """
    if not VERBOSE:
        # verbose flag not set: stay silent
        return
    click.echo(*args, **kwargs)
def normalize_unicode(value):
    """ Normalize unicode text to the form named by UNICODE_FORMAT

    Args:
        value: a str, a tuple of str, None, or any other object

    Returns:
        str: the normalized string
        tuple: a new tuple holding each element normalized
        anything else (including None): returned unchanged
    """
    if isinstance(value, str):
        return unicodedata.normalize(UNICODE_FORMAT, value)
    if isinstance(value, tuple):
        # every element is passed to unicodedata.normalize, so each must be a str
        return tuple(unicodedata.normalize(UNICODE_FORMAT, item) for item in value)
    # None and unsupported types pass straight through
    return value
def get_photos_db(*db_options):
""" Return path to photos db, select first non-None db_options
@@ -1863,6 +1876,15 @@ def _query(
to_date=to_date,
)
person = normalize_unicode(person)
keyword = normalize_unicode(keyword)
album = normalize_unicode(album)
folder = normalize_unicode(folder)
title = normalize_unicode(title)
description = normalize_unicode(description)
place = normalize_unicode(place)
label = normalize_unicode(label)
if album:
photos = get_photos_by_attribute(photos, "albums", album, ignore_case)

View File

@@ -9,6 +9,9 @@ from datetime import datetime
# Apple Epoch is Jan 1, 2001
TIME_DELTA = (datetime(2001, 1, 1, 0, 0) - datetime(1970, 1, 1, 0, 0)).total_seconds()
# Unicode format to use for comparing strings
UNICODE_FORMAT = "NFC"
# which Photos library database versions have been tested
# Photos 2.0 (10.12.6) == 2622
# Photos 3.0 (10.13.6) == 3301

View File

@@ -1,3 +1,3 @@
""" version info """
__version__ = "0.33.6"
__version__ = "0.33.7"

View File

@@ -4,7 +4,7 @@
import logging
from .._constants import _DB_TABLE_NAMES, _PHOTOS_4_VERSION
from ..utils import _open_sql_file
from ..utils import _open_sql_file, normalize_unicode
from .photosdb_utils import get_db_version
@@ -121,7 +121,7 @@ def _process_faceinfo_4(photosdb):
face["asset_uuid"] = asset_uuid
face["uuid"] = row[2]
face["person"] = person_id
face["fullname"] = row[3]
face["fullname"] = normalize_unicode(row[3])
face["sourcewidth"] = row[7]
face["sourceheight"] = row[8]
face["centerx"] = row[9]
@@ -282,7 +282,7 @@ def _process_faceinfo_5(photosdb):
face["asset_uuid"] = asset_uuid
face["uuid"] = row[2]
face["person"] = person_pk
face["fullname"] = row[4]
face["fullname"] = normalize_unicode(row[4])
face["agetype"] = row[5]
face["baldtype"] = row[6]
face["eyemakeuptype"] = row[7]

View File

@@ -10,7 +10,7 @@ import uuid as uuidlib
from pprint import pformat
from .._constants import _PHOTOS_4_VERSION, SEARCH_CATEGORY_LABEL
from ..utils import _db_is_locked, _debug, _open_sql_file
from ..utils import _db_is_locked, _debug, _open_sql_file, normalize_unicode
"""
This module should be imported in the class definition of PhotosDB in photosdb.py
@@ -112,8 +112,8 @@ def _process_searchinfo(self):
record["groupid"] = row[3]
record["category"] = row[4]
record["owning_groupid"] = row[5]
record["content_string"] = row[6].replace("\x00", "")
record["normalized_string"] = row[7].replace("\x00", "")
record["content_string"] = normalize_unicode(row[6].replace("\x00", ""))
record["normalized_string"] = normalize_unicode(row[7].replace("\x00", ""))
record["lookup_identifier"] = row[8]
try:
@@ -147,9 +147,10 @@ def _process_searchinfo(self):
"_db_searchinfo_labels_normalized: \n"
+ pformat(self._db_searchinfo_labels_normalized)
)
conn.close()
@property
def labels(self):
""" return list of all search info labels found in the library """

View File

@@ -44,6 +44,7 @@ from ..utils import (
_get_os_version,
_open_sql_file,
get_last_library_path,
normalize_unicode,
)
from .photosdb_utils import get_db_model_version, get_db_version
@@ -713,7 +714,7 @@ class PhotosDB:
for album in c:
self._dbalbum_details[album[0]] = {
"_uuid": album[0],
"title": album[1],
"title": normalize_unicode(album[1]),
"cloudlibrarystate": album[2],
"cloudidentifier": album[3],
"intrash": False if album[4] == 0 else True,
@@ -760,7 +761,7 @@ class PhotosDB:
self._dbfolder_details[uuid] = {
"_uuid": row[0],
"modelId": row[1],
"name": row[2],
"name": normalize_unicode(row[2]),
"isMagic": row[3],
"intrash": row[4],
"folderType": row[5],
@@ -963,7 +964,7 @@ class PhotosDB:
self._dbphotos[uuid]["volumeId"] = row[10]
self._dbphotos[uuid]["imagePath"] = row[11]
self._dbphotos[uuid]["extendedDescription"] = row[12]
self._dbphotos[uuid]["name"] = row[13]
self._dbphotos[uuid]["name"] = normalize_unicode(row[13])
self._dbphotos[uuid]["isMissing"] = row[14]
self._dbphotos[uuid]["originalFilename"] = row[15]
self._dbphotos[uuid]["favorite"] = row[16]
@@ -1608,7 +1609,7 @@ class PhotosDB:
for album in c:
self._dbalbum_details[album[0]] = {
"_uuid": album[0],
"title": album[1],
"title": normalize_unicode(album[1]),
"cloudlocalstate": album[2],
"cloudownerfirstname": album[3],
"cloudownderlastname": album[4],
@@ -1683,12 +1684,13 @@ class PhotosDB:
JOIN ZKEYWORD ON ZKEYWORD.Z_PK = {keyword_join} """
)
for keyword in c:
keyword_title = normalize_unicode(keyword[0])
if not keyword[1] in self._dbkeywords_uuid:
self._dbkeywords_uuid[keyword[1]] = []
if not keyword[0] in self._dbkeywords_keyword:
self._dbkeywords_keyword[keyword[0]] = []
if not keyword_title in self._dbkeywords_keyword:
self._dbkeywords_keyword[keyword_title] = []
self._dbkeywords_uuid[keyword[1]].append(keyword[0])
self._dbkeywords_keyword[keyword[0]].append(keyword[1])
self._dbkeywords_keyword[keyword_title].append(keyword[1])
if _debug():
logging.debug(f"Finished walking through keywords")
@@ -1795,7 +1797,7 @@ class PhotosDB:
info["modelID"] = None
info["masterUuid"] = None
info["masterFingerprint"] = row[1]
info["name"] = row[2]
info["name"] = normalize_unicode(row[2])
# There are sometimes negative values for lastmodifieddate in the database
# I don't know what these mean but they will raise exception in datetime if
@@ -2027,7 +2029,7 @@ class PhotosDB:
for row in c:
uuid = row[0]
if uuid in self._dbphotos:
self._dbphotos[uuid]["extendedDescription"] = row[1]
self._dbphotos[uuid]["extendedDescription"] = normalize_unicode(row[1])
else:
if _debug():
logging.debug(

View File

@@ -11,6 +11,9 @@ from collections import namedtuple # pylint: disable=syntax-error
import yaml
from bpylist import archiver
from ._constants import UNICODE_FORMAT
from .utils import normalize_unicode
# postal address information, returned by PlaceInfo.address
PostalAddress = namedtuple(
"PostalAddress",
@@ -76,12 +79,12 @@ class PLRevGeoLocationInfo:
geoServiceProvider,
postalAddress,
):
self.addressString = addressString
self.addressString = normalize_unicode(addressString)
self.countryCode = countryCode
self.mapItem = mapItem
self.isHome = isHome
self.compoundNames = compoundNames
self.compoundSecondaryNames = compoundSecondaryNames
self.compoundNames = normalize_unicode(compoundNames)
self.compoundSecondaryNames = normalize_unicode(compoundSecondaryNames)
self.version = version
self.geoServiceProvider = geoServiceProvider
self.postalAddress = postalAddress
@@ -183,7 +186,7 @@ class PLRevGeoMapItemAdditionalPlaceInfo:
def __init__(self, area, name, placeType, dominantOrderType):
self.area = area
self.name = name
self.name = normalize_unicode(name)
self.placeType = placeType
self.dominantOrderType = dominantOrderType
@@ -232,13 +235,13 @@ class CNPostalAddress:
_subLocality,
):
self._ISOCountryCode = _ISOCountryCode
self._city = _city
self._country = _country
self._postalCode = _postalCode
self._state = _state
self._street = _street
self._subAdministrativeArea = _subAdministrativeArea
self._subLocality = _subLocality
self._city = normalize_unicode(_city)
self._country = normalize_unicode(_country)
self._postalCode = normalize_unicode(_postalCode)
self._state = normalize_unicode(_state)
self._street = normalize_unicode(_street)
self._subAdministrativeArea = normalize_unicode(_subAdministrativeArea)
self._subLocality = normalize_unicode(_subLocality)
def __eq__(self, other):
return all(
@@ -414,9 +417,9 @@ class PlaceInfo4(PlaceInfo):
# 2: type
# 3: area
try:
places_dict[p[2]].append((p[1], p[3]))
places_dict[p[2]].append((normalize_unicode(p[1]), p[3]))
except KeyError:
places_dict[p[2]] = [(p[1], p[3])]
places_dict[p[2]] = [(normalize_unicode(p[1]), p[3])]
# build list to populate PlaceNames tuple
# initialize with empty lists for each field in PlaceNames

View File

@@ -10,6 +10,7 @@ import sqlite3
import subprocess
import sys
import tempfile
import unicodedata
import urllib.parse
from plistlib import load as plistload
@@ -18,6 +19,7 @@ import CoreServices
import objc
from Foundation import *
from ._constants import UNICODE_FORMAT
from .fileutil import FileUtil
_DEBUG = False
@@ -352,3 +354,13 @@ def _db_is_locked(dbname):
# attr = xattr.xattr(filepath)
# uuid_bytes = bytes(uuid, 'utf-8')
# attr.set(OSXPHOTOS_XATTR_UUID, uuid_bytes)
def normalize_unicode(value):
    """ Normalize a unicode string to the form named by UNICODE_FORMAT

    Args:
        value: str to normalize, or None

    Returns:
        the normalized str, or None when value is None

    Raises:
        ValueError: if value is neither None nor a str
    """
    if value is None:
        return None
    if isinstance(value, str):
        return unicodedata.normalize(UNICODE_FORMAT, value)
    raise ValueError("value must be str")

View File

@@ -7,7 +7,7 @@
<key>hostuuid</key>
<string>9575E48B-8D5F-5654-ABAC-4431B1167324</string>
<key>pid</key>
<integer>1847</integer>
<integer>1942</integer>
<key>processname</key>
<string>photolibraryd</string>
<key>uid</key>

View File

@@ -33,6 +33,7 @@ ALBUMS = [
"Raw",
"I have a deleted twin", # there's an empty album with same name that has been deleted
"EmptyAlbum",
"2018-10 - Sponsion, Museum, Frühstück, Römermuseum",
]
KEYWORDS_DICT = {
"Kids": 4,
@@ -53,6 +54,7 @@ ALBUM_DICT = {
"Raw": 4,
"I have a deleted twin": 1,
"EmptyAlbum": 0,
"2018-10 - Sponsion, Museum, Frühstück, Römermuseum": 1,
} # Note: there are 2 albums named "Test Album" for testing duplicate album names
UUID_DICT = {

View File

@@ -58,6 +58,8 @@ CLI_EXPORT_FILENAMES = [
CLI_EXPORT_FILENAMES_ALBUM = ["Pumkins1.jpg", "Pumkins2.jpg", "Pumpkins3.jpg"]
CLI_EXPORT_FILENAMES_ALBUM_UNICODE = ["IMG_4547.jpg"]
CLI_EXPORT_FILENAMES_DELETED_TWIN = ["wedding.jpg", "wedding_edited.jpeg"]
CLI_EXPORT_EDITED_SUFFIX = "_bearbeiten"
@@ -451,7 +453,6 @@ def test_query_uuid():
"--json",
"--db",
os.path.join(cwd, CLI_PHOTOS_DB),
# "./tests/Test-10.15.1.photoslibrary",
"--uuid",
"D79B8D77-BFFC-460B-9312-034F2877D35B",
],
@@ -1816,6 +1817,26 @@ def test_export_album():
files = glob.glob("*")
assert sorted(files) == sorted(CLI_EXPORT_FILENAMES_ALBUM)
def test_export_album_unicode_name():
    """Export by album name when the name contains non-ASCII characters """
    import glob
    import os

    from osxphotos.__main__ import export

    album_name = "2018-10 - Sponsion, Museum, Frühstück, Römermuseum"
    # resolve the library path before the runner switches to its temp directory
    library_path = os.path.join(os.getcwd(), PHOTOS_DB_15_6)
    cli_args = [library_path, ".", "--album", album_name, "-V"]

    runner = CliRunner()
    # pylint: disable=not-context-manager
    with runner.isolated_filesystem():
        result = runner.invoke(export, cli_args)
        assert result.exit_code == 0
        # glob while still inside the isolated filesystem, where "." was the export target
        exported = glob.glob("*")
        assert sorted(exported) == sorted(CLI_EXPORT_FILENAMES_ALBUM_UNICODE)
def test_export_album_deleted_twin():
"""Test export of an album where album of same name has been deleted """