Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions synapse/storage/databases/main/media_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def __init__(
update_name="local_media_repository_sha256_idx",
index_name="local_media_repository_sha256",
table="local_media_repository",
where_clause="WHERE sha256 IS NOT NULL",
where_clause="sha256 IS NOT NULL",
columns=[
"sha256",
],
Expand All @@ -173,7 +173,7 @@ def __init__(
update_name="remote_media_cache_sha256_idx",
index_name="remote_media_cache_sha256",
table="remote_media_cache",
where_clause="WHERE sha256 IS NOT NULL",
where_clause="sha256 IS NOT NULL",
columns=[
"sha256",
],
Expand Down
92 changes: 57 additions & 35 deletions synapse/storage/databases/main/room.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,15 @@
from synapse.config.homeserver import HomeServerConfig
from synapse.events import EventBase
from synapse.replication.tcp.streams.partial_state import UnPartialStatedRoomStream
from synapse.storage._base import db_to_json, make_in_list_sql_clause
from synapse.storage._base import (
db_to_json,
make_in_list_sql_clause,
)
from synapse.storage.database import (
DatabasePool,
LoggingDatabaseConnection,
LoggingTransaction,
make_tuple_in_list_sql_clause,
)
from synapse.storage.databases.main.cache import CacheInvalidationWorkerStore
from synapse.storage.types import Cursor
Expand Down Expand Up @@ -1150,15 +1154,14 @@ def _quarantine_local_media_txn(

# First, determine the hashes of the media we want to delete locally.
# We also want the media_ids for any media that lacks a hash.
hash_sql = (
"SELECT sha256, media_id FROM local_media_repository WHERE media_id IN ("
+ ", ".join(["?"] * len(mxcs))
+ ")"
hash_sql_many_clause_sql, hash_sql_many_clause_args = make_in_list_sql_clause(
txn.database_engine, "media_id", mxcs
)
hash_sql = f"SELECT sha256, media_id FROM local_media_repository WHERE {hash_sql_many_clause_sql}"
if quarantined_by is not None:
hash_sql += " AND safe_from_quarantine = FALSE"

txn.execute(hash_sql, mxcs)
txn.execute(hash_sql, hash_sql_many_clause_args)
results = txn.fetchall()

# Split results into hashes, and hashless media.
Expand All @@ -1171,34 +1174,40 @@ def _quarantine_local_media_txn(
non_hashed_media_ids.add(row[1])

total_media_quarantined = 0

# Effectively a legacy path, update any media
# that was explicitly named.
if len(non_hashed_media_ids):
sql = """
if non_hashed_media_ids:
sql_many_clause_sql, sql_many_clause_args = make_in_list_sql_clause(
txn.database_engine, "media_id", non_hashed_media_ids
)
sql = f"""
UPDATE local_media_repository
SET quarantined_by = ?
WHERE media_id = ?"""
WHERE {sql_many_clause_sql}"""

if quarantined_by is not None:
sql += " AND safe_from_quarantine = FALSE"

txn.executemany(sql, [(quarantined_by, x) for x in non_hashed_media_ids])
txn.execute(sql, [quarantined_by] + sql_many_clause_args)
total_media_quarantined += txn.rowcount if txn.rowcount > 0 else 0

# Update any media that was identified via hash.
if len(hashes):
if hashes:
# Update all the tables to set the quarantined_by flag
# We also pick up any media with a matching hash.
sql = """
sql_many_clause_sql, sql_many_clause_args = make_in_list_sql_clause(
txn.database_engine, "sha256", hashes
)
sql = f"""
UPDATE local_media_repository
SET quarantined_by = ?
WHERE sha256 = ?"""
WHERE {sql_many_clause_sql}"""

# set quarantine
if quarantined_by is not None:
sql += " AND safe_from_quarantine = FALSE"

txn.executemany(sql, [(quarantined_by, x) for x in hashes])
txn.execute(sql, [quarantined_by] + sql_many_clause_args)
# Note that a rowcount of -1 can be used to indicate no rows were affected.
total_media_quarantined += txn.rowcount if txn.rowcount > 0 else 0

Expand Down Expand Up @@ -1227,43 +1236,56 @@ def _quarantine_remote_media_txn(

# First, determine the hashes of the media we want to delete locally.
# We also want the media_ids for any media that lacks a hash.

hashes = set()
non_hashed_media_ids = set()
for media_origin, media_id in mxcs:
hash_sql = "SELECT sha256, media_id, media_origin FROM remote_media_cache WHERE media_origin = ? AND media_id = ?"
txn.execute(hash_sql, (media_origin, media_id))
# Split results into hashes, and hashless media.
row = txn.fetchone()
if row:
if row[0]:
hashes.add(row[0])
else:
non_hashed_media_ids.add((row[1], row[2]))

hash_sql_in_list_clause, hash_sql_args = make_tuple_in_list_sql_clause(
txn.database_engine,
("media_origin", "media_id"),
mxcs,
)

hash_sql = f"SELECT sha256, media_id, media_origin FROM remote_media_cache WHERE {hash_sql_in_list_clause}"
txn.execute(hash_sql, hash_sql_args)
# Split results into hashes, and hashless media.
row = txn.fetchone()
if row:
if row[0]:
hashes.add(row[0])
else:
non_hashed_media_ids.add((row[1], row[2]))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was probably intended to be `for row in txn:` instead of the `fetchone` and the `if` check — `fetchone` only processes the first matching row.

For readability, also consider unpacking the row e.g. for sha256, media_id, media_origin in txn:

(I think this would also reveal that this current implementation is swapping media_id and media_origin?)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using for x,y in txn now :)


total_media_quarantined = 0
# Effectively a legacy path, update any media
# that was explicitly named.
if len(non_hashed_media_ids):
sql = """
if non_hashed_media_ids:
sql_in_list_clause, sql_args = make_tuple_in_list_sql_clause(
txn.database_engine,
("media_origin", "media_id"),
non_hashed_media_ids,
)
sql = f"""
UPDATE remote_media_cache
SET quarantined_by = ?
WHERE media_id = ? AND media_origin = ?"""
WHERE {sql_in_list_clause}"""

txn.executemany(
sql, [(quarantined_by, x, y) for x, y in non_hashed_media_ids]
)
txn.execute(sql, [quarantined_by] + sql_args)
total_media_quarantined += txn.rowcount if txn.rowcount > 0 else 0

total_media_quarantined = 0
# Update any media that was identified via hash.
if len(hashes):
if hashes:
sql_many_clause_sql, sql_many_clause_args = make_in_list_sql_clause(
txn.database_engine, "sha256", hashes
)
# Update all the tables to set the quarantined_by flag
# We also pick up any media with a matching hash.
sql = """
sql = f"""
UPDATE remote_media_cache
SET quarantined_by = ?
WHERE sha256 = ?"""
txn.executemany(sql, [(quarantined_by, x) for x in hashes])
WHERE {sql_many_clause_sql}"""
txn.execute(sql, [quarantined_by] + sql_many_clause_args)
# Note that a rowcount of -1 can be used to indicate no rows were affected.
total_media_quarantined += txn.rowcount if txn.rowcount > 0 else 0

Expand Down
2 changes: 1 addition & 1 deletion synapse/storage/databases/main/user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
import icu

USE_ICU = True
except ModuleNotFoundError:
except Exception:
USE_ICU = False

from synapse.api.errors import StoreError
Expand Down
5 changes: 5 additions & 0 deletions synapse/storage/schema/main/delta/91/01_media_hash.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,8 @@
-- Store the SHA256 content hash of media files.
ALTER TABLE local_media_repository ADD COLUMN sha256 TEXT;
ALTER TABLE remote_media_cache ADD COLUMN sha256 TEXT;

-- Add background updates to handle creating the new indexes.
INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
(9101, 'local_media_repository_sha256_idx', '{}'),
(9101, 'remote_media_cache_sha256_idx', '{}');