stones/scripts/utils/merge_duplicate_contacts.py

224 lines
9.2 KiB
Python

#!/usr/bin/env python3
"""
Merge Duplicate Contacts
This script finds and merges duplicate contacts in the database.
Duplicates are defined as contacts with the same Ethereum address.
Usage:
python merge_duplicate_contacts.py
"""
import os
import sys
import argparse
import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Scalar "Contact" columns that may be back-filled on the primary contact
# from a duplicate when the primary's own value is NULL.
_MERGEABLE_FIELDS = (
    "ensName", "name", "email", "twitter", "discord",
    "telegram", "farcaster", "otherSocial", "warpcastAddress",
    "ethereumAddress2",
)

# Addresses shared by more than one contact. NULL addresses are excluded:
# the per-address lookup below uses `"ethereumAddress" = %s`, which can never
# match NULL, so a NULL group could only inflate the reported count.
_FIND_DUPLICATE_ADDRESSES = """
    SELECT "ethereumAddress", COUNT(*) as count
    FROM "Contact"
    WHERE "ethereumAddress" IS NOT NULL
    GROUP BY "ethereumAddress"
    HAVING COUNT(*) > 1
    ORDER BY COUNT(*) DESC
"""

# All contacts for one address, oldest first (the oldest becomes the primary).
_SELECT_CONTACTS_BY_ADDRESS = """
    SELECT id, "ethereumAddress", "ensName", name, email,
           twitter, discord, telegram, farcaster, "otherSocial",
           "warpcastAddress", "ethereumAddress2", "createdAt"
    FROM "Contact"
    WHERE "ethereumAddress" = %s
    ORDER BY "createdAt" ASC
"""

_CONTACT_SOURCE_EXISTS = """
    SELECT EXISTS (
        SELECT FROM information_schema.tables
        WHERE table_name = 'ContactSource'
    ) as exists
"""

# (label used in the progress message, INSERT ... SELECT statement) pairs,
# executed in this order. Every statement takes the parameters
# (primary_id, duplicate_id) and copies the duplicate's rows onto the primary.
# The source rows are not deleted by these statements.
# NOTE(review): presumably they disappear via ON DELETE CASCADE when the
# duplicate contact itself is deleted -- confirm against the schema.
_RELATED_ROW_COPIES = (
    ("NFT holdings", """
        INSERT INTO "NftHolding" (
            id, "contactId", "contractAddress", "tokenId", "collectionName",
            "acquiredAt", "createdAt", "updatedAt"
        )
        SELECT
            gen_random_uuid(), %s, "contractAddress", "tokenId", "collectionName",
            "acquiredAt", "createdAt", NOW()
        FROM "NftHolding"
        WHERE "contactId" = %s
        ON CONFLICT ("contactId", "contractAddress", "tokenId") DO NOTHING
    """),
    ("token holdings", """
        INSERT INTO "TokenHolding" (
            id, "contactId", "contractAddress", "tokenSymbol", balance,
            "lastUpdated", "createdAt", "updatedAt"
        )
        SELECT
            gen_random_uuid(), %s, "contractAddress", "tokenSymbol", balance,
            "lastUpdated", "createdAt", NOW()
        FROM "TokenHolding"
        WHERE "contactId" = %s
        ON CONFLICT ("contactId", "contractAddress") DO NOTHING
    """),
    ("DAO memberships", """
        INSERT INTO "DaoMembership" (
            id, "contactId", "daoName", "daoType", "joinedAt", "createdAt", "updatedAt"
        )
        SELECT
            gen_random_uuid(), %s, "daoName", "daoType", "joinedAt", "createdAt", NOW()
        FROM "DaoMembership"
        WHERE "contactId" = %s
        ON CONFLICT ("contactId", "daoName") DO NOTHING
    """),
    # No ON CONFLICT clause here: every note of the duplicate is inserted as
    # a new row for the primary.
    ("notes", """
        INSERT INTO "Note" (
            id, "contactId", content, "createdAt", "updatedAt"
        )
        SELECT
            gen_random_uuid(), %s, content, "createdAt", NOW()
        FROM "Note"
        WHERE "contactId" = %s
    """),
    ("tags", """
        INSERT INTO "TagsOnContacts" (
            "contactId", "tagId", "assignedAt"
        )
        SELECT
            %s, "tagId", "assignedAt"
        FROM "TagsOnContacts"
        WHERE "contactId" = %s
        ON CONFLICT ("contactId", "tagId") DO NOTHING
    """),
)

# Only executed when the "ContactSource" table is present.
_CONTACT_SOURCE_COPY = """
    INSERT INTO "ContactSource" (
        id, "contactId", "dataSourceId", "createdAt", "updatedAt"
    )
    SELECT
        gen_random_uuid(), %s, "dataSourceId", "createdAt", NOW()
    FROM "ContactSource"
    WHERE "contactId" = %s
    ON CONFLICT ("contactId", "dataSourceId") DO NOTHING
"""


def _resolve_db_url():
    """Return the connection string taken from the environment.

    PYTHON_DATABASE_URL is preferred. Otherwise DATABASE_URL is used with
    everything from "?schema=" onward removed.

    Raises:
        RuntimeError: if neither variable is set.
    """
    db_url = os.getenv("PYTHON_DATABASE_URL")
    if db_url:
        return db_url
    fallback_url = os.getenv("DATABASE_URL")
    if fallback_url is None:
        # Fail with an actionable message instead of the AttributeError that
        # calling .split() on None would produce.
        raise RuntimeError(
            "Neither PYTHON_DATABASE_URL nor DATABASE_URL is set; "
            "cannot connect to the database"
        )
    return fallback_url.split("?schema=")[0]


def _backfill_primary(cursor, primary_contact, contact):
    """Copy values from *contact* into NULL fields of the primary contact.

    *primary_contact* is updated in memory as well as in the database, so a
    field filled from one duplicate is not overwritten by a later duplicate.
    """
    assignments = []
    values = []
    for field in _MERGEABLE_FIELDS:
        if contact[field] is not None and primary_contact[field] is None:
            assignments.append(f'"{field}" = %s')
            values.append(contact[field])
            primary_contact[field] = contact[field]
            print(f" Updating primary contact {field} to {contact[field]}")
    if not assignments:
        return
    values.append(primary_contact["id"])
    # Column names come from the fixed _MERGEABLE_FIELDS tuple, never from
    # external input; the values themselves are passed as bound parameters.
    query = f"""
        UPDATE "Contact"
        SET {', '.join(assignments)}, "updatedAt" = NOW()
        WHERE id = %s
    """
    cursor.execute(query, values)


def _merge_address_group(cursor, eth_address, has_contact_source):
    """Merge all contacts sharing *eth_address* into the oldest one.

    Returns the number of duplicate contacts that were deleted.
    """
    cursor.execute(_SELECT_CONTACTS_BY_ADDRESS, (eth_address,))
    contacts = cursor.fetchall()
    # Skip if we somehow don't have duplicates
    if len(contacts) <= 1:
        return 0

    # Oldest contact is the primary. Work on a plain dict copy so that
    # back-filled values can be tracked across the remaining duplicates.
    primary_contact = dict(contacts[0])
    primary_id = primary_contact["id"]
    print(f"Processing {len(contacts)} duplicates for address {eth_address}")
    print(f" Primary contact: {primary_id}")

    for contact in contacts[1:]:
        contact_id = contact["id"]
        for label, statement in _RELATED_ROW_COPIES:
            print(f" Moving {label} from {contact_id} to {primary_id}")
            cursor.execute(statement, (primary_id, contact_id))
        if has_contact_source:
            print(f" Moving contact sources from {contact_id} to {primary_id}")
            cursor.execute(_CONTACT_SOURCE_COPY, (primary_id, contact_id))

        _backfill_primary(cursor, primary_contact, contact)

        print(f" Deleting duplicate contact {contact_id}")
        cursor.execute('DELETE FROM "Contact" WHERE id = %s', (contact_id,))
    return len(contacts) - 1


def merge_duplicate_contacts():
    """
    Find and merge duplicate contacts.

    Contacts sharing the same non-NULL "ethereumAddress" are treated as
    duplicates. For each such address the oldest contact (by "createdAt")
    is kept as the primary. Related rows of the other contacts are copied
    onto it, NULL fields of the primary are filled from the duplicates
    (oldest duplicate wins), and the duplicates are deleted.

    Each address group is merged inside its own transaction: it is committed
    when the group completes and rolled back if any statement fails, so a
    failure never leaves a half-merged contact behind. Groups committed
    before the failure stay merged.

    Progress is reported on stdout.

    Raises:
        RuntimeError: if no database URL is configured in the environment.
    """
    db_url = _resolve_db_url()

    # Connect to the database
    conn = psycopg2.connect(db_url)
    # Explicitly keep autocommit off; commits happen per address group below.
    conn.autocommit = False
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute(_FIND_DUPLICATE_ADDRESSES)
            duplicates = cursor.fetchall()
            print(f"Found {len(duplicates)} Ethereum addresses with duplicate contacts")

            # The table's existence cannot change between contacts, so check
            # it once instead of once per duplicate.
            cursor.execute(_CONTACT_SOURCE_EXISTS)
            result = cursor.fetchone()
            has_contact_source = bool(result and result["exists"])

            total_merged = 0
            for duplicate in duplicates:
                # `with conn` commits on success and rolls back on an
                # exception; it does not close the connection.
                with conn:
                    total_merged += _merge_address_group(
                        cursor, duplicate["ethereumAddress"], has_contact_source
                    )
            print(f"Merged {total_merged} duplicate contacts")
    finally:
        conn.close()
def main():
    """Command-line entry point: parse arguments, then run the merge."""
    # The script defines no options. parse_args() is still called so that
    # -h/--help works and unexpected arguments are rejected by argparse.
    cli = argparse.ArgumentParser(description="Merge duplicate contacts")
    cli.parse_args()
    merge_duplicate_contacts()


# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()