#!/usr/bin/env python3 """ Merge Duplicate Contacts This script finds and merges duplicate contacts in the database based on Ethereum addresses. It keeps the most complete record and updates all relationships to point to the primary contact. Usage: python merge_duplicate_contacts.py """ import os import sys import logging from typing import Dict, Any, List, Optional, Tuple from dotenv import load_dotenv # Add parent directory to path to import utils sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from utils.db_connector import DatabaseConnector from utils.logger import setup_logger # Load environment variables load_dotenv() # Setup logging logger = setup_logger("duplicate_contact_merger") class DuplicateContactMerger: """Merger for duplicate contacts""" def __init__(self): """Initialize the merger""" # Initialize database self.db = DatabaseConnector() def find_duplicate_contacts(self) -> List[Dict[str, Any]]: """Find duplicate contacts based on Ethereum address""" query = """ SELECT "ethereumAddress", COUNT(*) as count, array_agg(id) as contact_ids FROM "Contact" WHERE "ethereumAddress" IS NOT NULL GROUP BY "ethereumAddress" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC """ return self.db.execute_query(query) def get_contact_details(self, contact_id: str) -> Dict[str, Any]: """Get details for a specific contact""" query = """ SELECT id, "ethereumAddress", "ensName", name, "createdAt", "updatedAt" FROM "Contact" WHERE id = %(contact_id)s """ result = self.db.execute_query(query, {"contact_id": contact_id}) if result: return result[0] return {} def determine_primary_contact(self, contact_ids: List[str]) -> Tuple[str, List[str]]: """ Determine which contact should be the primary one to keep. Strategy: 1. Prefer contacts with ENS names 2. Prefer contacts with non-generic names 3. Prefer older contacts (earlier creation date) Returns: Tuple of (primary_contact_id, secondary_contact_ids) """ contacts = [self.get_contact_details(cid) for cid in contact_ids] # Sort contacts by our preference criteria sorted_contacts = sorted( contacts, key=lambda c: ( # Prefer contacts with ENS names (None sorts last) c.get("ensName") is None, # Prefer contacts with non-generic names c.get("name", "").startswith("RG_"), # Prefer older contacts c.get("createdAt") ) ) primary = sorted_contacts[0]["id"] secondaries = [c["id"] for c in sorted_contacts[1:]] return primary, secondaries def merge_contacts(self, primary_id: str, secondary_ids: List[str]) -> bool: """ Merge secondary contacts into the primary contact. Steps: 1. Update all relationships to point to the primary contact 2. Merge any missing data from secondaries into primary 3. Delete the secondary contacts Returns: True if successful, False otherwise """ try: # Start a transaction self.db.execute_query("BEGIN") # Get primary contact details primary = self.get_contact_details(primary_id) logger.info(f"Merging contacts into primary: {primary_id} ({primary.get('name')})") # Get existing DAO memberships for the primary contact primary_daos_query = """ SELECT "daoName" FROM "DaoMembership" WHERE "contactId" = %(contact_id)s """ primary_daos_result = self.db.execute_query(primary_daos_query, {"contact_id": primary_id}) primary_daos = [row["daoName"] for row in primary_daos_result] if primary_daos_result else [] logger.info(f"Primary contact is already a member of: {', '.join(primary_daos) if primary_daos else 'No DAOs'}") # For each secondary contact, transfer its DAO memberships to the primary for secondary_id in secondary_ids: # Get DAO memberships for this secondary contact secondary_daos_query = """ SELECT id, "daoName", "daoType", "joinedAt" FROM "DaoMembership" WHERE "contactId" = %(contact_id)s """ secondary_daos = self.db.execute_query(secondary_daos_query, {"contact_id": secondary_id}) if not secondary_daos: logger.info(f"Secondary contact {secondary_id} has no DAO memberships") continue logger.info(f"Secondary contact {secondary_id} is a member of: {', '.join([dao['daoName'] for dao in secondary_daos])}") # For each DAO membership of the secondary contact for dao in secondary_daos: dao_name = dao["daoName"] if dao_name in primary_daos: # Primary already has this membership, check if we need to update join date primary_dao_query = """ SELECT id, "joinedAt" FROM "DaoMembership" WHERE "contactId" = %(contact_id)s AND "daoName" = %(dao_name)s """ primary_dao = self.db.execute_query(primary_dao_query, { "contact_id": primary_id, "dao_name": dao_name })[0] # If secondary has an earlier join date, update the primary's join date if dao["joinedAt"] and (not primary_dao["joinedAt"] or dao["joinedAt"] < primary_dao["joinedAt"]): update_query = """ UPDATE "DaoMembership" SET "joinedAt" = %(joined_at)s, "updatedAt" = NOW() WHERE id = %(membership_id)s """ self.db.execute_update(update_query, { "membership_id": primary_dao["id"], "joined_at": dao["joinedAt"] }) logger.info(f"Updated join date for {dao_name} membership of primary contact") else: # Primary doesn't have this membership, transfer it update_query = """ UPDATE "DaoMembership" SET "contactId" = %(primary_id)s, "updatedAt" = NOW() WHERE id = %(membership_id)s """ self.db.execute_update(update_query, { "primary_id": primary_id, "membership_id": dao["id"] }) # Add to primary's DAO list to avoid duplicates in future iterations primary_daos.append(dao_name) logger.info(f"Transferred {dao_name} membership from secondary to primary contact") # Update notes self.db.execute_update(""" UPDATE "Note" SET "contactId" = %(primary_id)s, "updatedAt" = NOW() WHERE "contactId" IN %(secondary_ids)s """, { "primary_id": primary_id, "secondary_ids": tuple(secondary_ids) }) logger.info(f"Transferred notes from secondary contacts to primary") # Update ENS name if primary doesn't have one but a secondary does if not primary.get("ensName"): ens_query = """ SELECT "ensName" FROM "Contact" WHERE id IN %(secondary_ids)s AND "ensName" IS NOT NULL LIMIT 1 """ ens_result = self.db.execute_query(ens_query, {"secondary_ids": tuple(secondary_ids)}) if ens_result and ens_result[0]["ensName"]: self.db.execute_update(""" UPDATE "Contact" SET "ensName" = %(ens_name)s, "updatedAt" = NOW() WHERE id = %(primary_id)s """, { "primary_id": primary_id, "ens_name": ens_result[0]["ensName"] }) logger.info(f"Updated primary contact with ENS name: {ens_result[0]['ensName']}") # Transfer any other social media info that primary might be missing for field in ["twitter", "discord", "telegram", "email", "farcaster", "otherSocial"]: social_query = f""" SELECT "{field}" FROM "Contact" WHERE id IN %(secondary_ids)s AND "{field}" IS NOT NULL AND "{field}" != '' LIMIT 1 """ social_result = self.db.execute_query(social_query, {"secondary_ids": tuple(secondary_ids)}) if social_result and social_result[0][field]: # Check if primary has this field primary_social_query = f""" SELECT "{field}" FROM "Contact" WHERE id = %(primary_id)s """ primary_social = self.db.execute_query(primary_social_query, {"primary_id": primary_id})[0] # Only update if primary doesn't have this field if not primary_social[field]: update_query = f""" UPDATE "Contact" SET "{field}" = %(value)s, "updatedAt" = NOW() WHERE id = %(primary_id)s """ self.db.execute_update(update_query, { "primary_id": primary_id, "value": social_result[0][field] }) logger.info(f"Updated primary contact with {field}: {social_result[0][field]}") # Delete secondary contacts self.db.execute_update(""" DELETE FROM "Contact" WHERE id IN %(secondary_ids)s """, { "secondary_ids": tuple(secondary_ids) }) # Commit the transaction self.db.execute_query("COMMIT") logger.info(f"Successfully merged {len(secondary_ids)} contacts into {primary_id}") return True except Exception as e: # Rollback on error self.db.execute_query("ROLLBACK") logger.error(f"Error merging contacts: {e}") return False def run(self): """Run the merger""" logger.info("Starting duplicate contact merger") # Find duplicate contacts duplicates = self.find_duplicate_contacts() logger.info(f"Found {len(duplicates)} Ethereum addresses with duplicate contacts") total_merged = 0 for dup in duplicates: eth_address = dup["ethereumAddress"] count = dup["count"] contact_ids = dup["contact_ids"] logger.info(f"Processing {count} duplicates for address {eth_address}") # Determine primary and secondary contacts primary_id, secondary_ids = self.determine_primary_contact(contact_ids) # Merge the contacts if self.merge_contacts(primary_id, secondary_ids): total_merged += len(secondary_ids) logger.info(f"Merged a total of {total_merged} duplicate contacts") return total_merged def main(): """Main function""" try: merger = DuplicateContactMerger() merged_count = merger.run() logger.info(f"Duplicate contact merging completed successfully. Merged {merged_count} contacts.") return 0 except Exception as e: logger.exception(f"Error merging duplicate contacts: {e}") return 1 if __name__ == "__main__": sys.exit(main())