#!/usr/bin/env python3
"""
Check and Fix Data Issues

This script checks for various data issues in the database and fixes them:
1. Duplicate contacts by Ethereum address
2. Contacts with multiple DAO memberships
3. Contacts with missing or generic names
4. Other data quality issues

Usage:
    python check_and_fix_data.py
"""

import os
import sys
import logging
from typing import Dict, Any, List, Optional, Tuple
from dotenv import load_dotenv

# Add parent directory to path to import utils
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.db_connector import DatabaseConnector
from utils.logger import setup_logger

# Load environment variables
load_dotenv()

# Setup logging
logger = setup_logger("data_checker_fixer")


class DataCheckerFixer:
    """Checker and fixer for data issues.

    Every ``check_*`` method runs one read-only query and returns its rows;
    every ``fix_*`` method re-runs the matching check and repairs what it
    finds. Rows are accessed by column name (``row["id"]``), so the
    connector is expected to return mapping-like rows.
    """

    def __init__(self):
        """Initialize the checker/fixer with a fresh database connector."""
        # Initialize database
        self.db = DatabaseConnector()

    def check_duplicate_addresses(self) -> List[Dict[str, Any]]:
        """Check for duplicate Ethereum addresses in the Contact table.

        Returns:
            One row per address that occurs more than once (exact match),
            with keys ``ethereumAddress``, ``count`` and ``contact_ids``,
            ordered by descending count.
        """
        query = """
            SELECT "ethereumAddress", COUNT(*) as count, array_agg(id) as contact_ids
            FROM "Contact"
            WHERE "ethereumAddress" IS NOT NULL
            GROUP BY "ethereumAddress"
            HAVING COUNT(*) > 1
            ORDER BY COUNT(*) DESC
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} duplicate Ethereum addresses")

        return result

    def check_multi_dao_contacts(self) -> List[Dict[str, Any]]:
        """Check for contacts that belong to multiple DAOs.

        This is informational only; each matching contact is also logged.

        Returns:
            One row per contact with more than one distinct DAO name, with
            keys ``id``, ``ethereumAddress``, ``name``, ``dao_count`` and
            ``daos``, ordered by descending ``dao_count``.
        """
        query = """
            SELECT c.id, c."ethereumAddress", c.name,
                   COUNT(DISTINCT dm."daoName") as dao_count,
                   array_agg(dm."daoName") as daos
            FROM "Contact" c
            JOIN "DaoMembership" dm ON c.id = dm."contactId"
            GROUP BY c.id, c."ethereumAddress", c.name
            HAVING COUNT(DISTINCT dm."daoName") > 1
            ORDER BY dao_count DESC
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} contacts that belong to multiple DAOs")

        # Print details for each multi-DAO contact
        for contact in result:
            logger.info(f"Contact {contact['name']} ({contact['ethereumAddress']}) belongs to {contact['dao_count']} DAOs: {', '.join(contact['daos'])}")

        return result

    def check_generic_names(self) -> List[Dict[str, Any]]:
        """Check for contacts with generic or missing names.

        A name counts as generic when it is NULL, empty, exactly
        ``'Raid Guild Member'``, or starts with the literal prefix ``RG_``.

        Returns:
            Matching rows with keys ``id``, ``ethereumAddress``, ``name``
            and ``ensName``.
        """
        query = """
            SELECT id, "ethereumAddress", name, "ensName"
            FROM "Contact"
            WHERE name IS NULL
               OR name = ''
               OR name = 'Raid Guild Member'
               OR name LIKE 'RG\\_%%' ESCAPE '\\'
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} contacts with generic or missing names")

        return result

    def fix_generic_names(self) -> int:
        """Fix contacts with generic or missing names.

        The replacement name is, in order of preference: the ENS name
        (without a trailing ``.eth``), or a shortened Ethereum address
        prefixed with the first two letters of the contact's first DAO, or
        a shortened address prefixed with ``ETH``. Contacts that have
        neither a usable ENS name nor an address are skipped with a warning.

        Returns:
            Number of contacts whose name was updated.
        """
        # Get contacts with generic names
        contacts = self.check_generic_names()

        fixed_count = 0
        for contact in contacts:
            contact_id = contact["id"]
            eth_address = contact["ethereumAddress"]
            ens_name = contact["ensName"]

            # Generate a better name, preferring the ENS name without .eth
            name = None
            if ens_name:
                name = ens_name[:-4] if ens_name.endswith('.eth') else ens_name

            # An ENS name of exactly ".eth" strips down to an empty string,
            # which would be flagged as generic again, so fall back to the
            # address-based name in that case as well.
            if not name:
                if not eth_address:
                    # Nothing to build a name from; slicing None would raise
                    # and abort the whole run.
                    logger.warning(f"Skipping contact {contact_id}: no ENS name or Ethereum address to derive a name from")
                    continue

                # Get DAO memberships
                dao_query = """
                    SELECT "daoName"
                    FROM "DaoMembership"
                    WHERE "contactId" = %(contact_id)s
                """
                dao_result = self.db.execute_query(dao_query, {"contact_id": contact_id})
                dao_names = [dao["daoName"] for dao in dao_result] if dao_result else []

                # Use first DAO as prefix for the shortened address
                short_address = f"{eth_address[:6]}...{eth_address[-4:]}"
                if dao_names:
                    dao_prefix = dao_names[0][:2]  # First two letters of the DAO name
                    name = f"{dao_prefix}_{short_address}"
                else:
                    name = f"ETH_{short_address}"

            # Update the contact
            update_query = """
                UPDATE "Contact"
                SET name = %(name)s,
                    "updatedAt" = NOW()
                WHERE id = %(contact_id)s
            """
            rows_updated = self.db.execute_update(update_query, {
                "contact_id": contact_id,
                "name": name
            })

            if rows_updated > 0:
                logger.info(f"Updated name for contact {contact_id} to '{name}'")
                fixed_count += 1

        logger.info(f"Fixed {fixed_count} contacts with generic or missing names")
        return fixed_count

    def check_case_sensitivity_issues(self) -> List[Dict[str, Any]]:
        """Check for case sensitivity issues in Ethereum addresses.

        Contacts are grouped by the lowercased address. Rows without an
        address are excluded: they would otherwise all fall into a single
        NULL group and be reported (and later merged) as one contact.

        NOTE(review): groups consisting only of exact duplicates (same
        casing) are reported here too, because the HAVING clause counts
        rows rather than distinct spellings.

        Returns:
            One row per lowercased address shared by more than one contact,
            with keys ``lower_address``, ``addresses``, ``contact_ids`` and
            ``count``.
        """
        query = """
            SELECT LOWER("ethereumAddress") as lower_address,
                   array_agg("ethereumAddress") as addresses,
                   array_agg(id) as contact_ids,
                   COUNT(*) as count
            FROM "Contact"
            WHERE "ethereumAddress" IS NOT NULL
            GROUP BY LOWER("ethereumAddress")
            HAVING COUNT(*) > 1
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} potential case sensitivity issues with Ethereum addresses")

        return result

    def _merge_contact_into(self, primary_id: Any, secondary_id: Any) -> None:
        """Move one secondary contact's data to the primary, then delete it.

        DAO memberships the primary does not yet have are re-pointed to the
        primary, all notes are re-pointed, and the secondary contact row is
        deleted. Must be called inside the caller's transaction.
        """
        # Get DAO memberships for this secondary contact
        secondary_daos_query = """
            SELECT id, "daoName", "daoType", "joinedAt"
            FROM "DaoMembership"
            WHERE "contactId" = %(contact_id)s
        """
        secondary_daos = self.db.execute_query(secondary_daos_query, {"contact_id": secondary_id})

        # Get existing DAO memberships for the primary contact
        primary_daos_query = """
            SELECT "daoName"
            FROM "DaoMembership"
            WHERE "contactId" = %(contact_id)s
        """
        primary_daos_result = self.db.execute_query(primary_daos_query, {"contact_id": primary_id})
        primary_daos = {row["daoName"] for row in primary_daos_result} if primary_daos_result else set()

        # For each DAO membership of the secondary contact
        for dao in secondary_daos or []:
            dao_name = dao["daoName"]

            if dao_name in primary_daos:
                # Primary already has this membership, skip.
                # NOTE(review): the skipped row still references the
                # secondary contact; presumably it is removed by a cascading
                # delete when the contact row goes - confirm against schema.
                logger.info(f" Primary already has membership in {dao_name}, skipping")
                continue

            # Primary doesn't have this membership, transfer it
            update_query = """
                UPDATE "DaoMembership"
                SET "contactId" = %(primary_id)s,
                    "updatedAt" = NOW()
                WHERE id = %(membership_id)s
            """
            self.db.execute_update(update_query, {
                "primary_id": primary_id,
                "membership_id": dao["id"]
            })

            # Remember it so a second row for the same DAO is not moved too
            primary_daos.add(dao_name)

            logger.info(f" Transferred {dao_name} membership from secondary to primary contact")

        # Transfer notes
        notes_query = """
            UPDATE "Note"
            SET "contactId" = %(primary_id)s,
                "updatedAt" = NOW()
            WHERE "contactId" = %(secondary_id)s
        """
        self.db.execute_update(notes_query, {
            "primary_id": primary_id,
            "secondary_id": secondary_id
        })

        # Delete the secondary contact
        delete_query = """
            DELETE FROM "Contact"
            WHERE id = %(secondary_id)s
        """
        self.db.execute_update(delete_query, {"secondary_id": secondary_id})

        logger.info(f" Deleted secondary contact {secondary_id}")

    def fix_case_sensitivity_issues(self) -> int:
        """Fix case sensitivity issues in Ethereum addresses.

        For each group of contacts sharing a lowercased address, one primary
        contact is kept (the one already stored in lowercase, otherwise the
        first of the group, whose address is lowercased) and the others are
        merged into it. Each group is handled in its own BEGIN/COMMIT pair;
        on any error the group is rolled back, logged and skipped.

        Returns:
            Number of contacts updated or merged in groups that were
            committed. Work from rolled-back groups is not counted.
        """
        # Get case sensitivity issues
        issues = self.check_case_sensitivity_issues()

        fixed_count = 0
        for issue in issues:
            lower_address = issue["lower_address"]
            addresses = issue["addresses"]
            contact_ids = issue["contact_ids"]

            logger.info(f"Fixing case sensitivity issue for address {lower_address}")
            logger.info(f" Found variations: {', '.join(addresses)}")

            # Counted per group and only added to the total after COMMIT
            group_fixed = 0
            try:
                # Start a transaction
                self.db.execute_query("BEGIN")

                # First, check if any contact already has the lowercase address
                existing_query = """
                    SELECT id
                    FROM "Contact"
                    WHERE "ethereumAddress" = %(lower_address)s
                """
                existing = self.db.execute_query(existing_query, {"lower_address": lower_address})

                if existing:
                    # We have a contact with the lowercase address already
                    # This is the primary contact we'll keep
                    primary_id = existing[0]["id"]
                    secondary_ids = [cid for cid in contact_ids if cid != primary_id]

                    logger.info(f" Found existing contact {primary_id} with lowercase address")
                    logger.info(f" Will merge {len(secondary_ids)} other contacts into it")
                else:
                    # No contact has the lowercase address yet
                    # Choose the first contact as primary and update its address to lowercase
                    primary_id = contact_ids[0]
                    secondary_ids = contact_ids[1:]

                    logger.info(" No contact with lowercase address found")
                    logger.info(f" Will update contact {primary_id} to lowercase and merge {len(secondary_ids)} others into it")

                    # Update primary contact to lowercase
                    update_query = """
                        UPDATE "Contact"
                        SET "ethereumAddress" = %(lower_address)s,
                            "updatedAt" = NOW()
                        WHERE id = %(primary_id)s
                    """
                    self.db.execute_update(update_query, {
                        "primary_id": primary_id,
                        "lower_address": lower_address
                    })

                    logger.info(f" Updated address for contact {primary_id} to lowercase")
                    group_fixed += 1

                # Merge every remaining contact of the group into the primary
                for secondary_id in secondary_ids:
                    self._merge_contact_into(primary_id, secondary_id)
                    group_fixed += 1

                # Commit the transaction
                self.db.execute_query("COMMIT")
                logger.info(f" Successfully fixed case sensitivity issue for {lower_address}")

            except Exception as e:
                # Log first so the original error is not lost if the
                # rollback itself fails, then roll back.
                logger.error(f"Error fixing case sensitivity issue for {lower_address}: {e}")
                self.db.execute_query("ROLLBACK")
                continue

            fixed_count += group_fixed

        logger.info(f"Fixed {fixed_count} contacts with case sensitivity issues")
        return fixed_count

    def check_dao_membership_issues(self) -> List[Dict[str, Any]]:
        """Check for issues with DAO memberships.

        Returns:
            One row per (contact, DAO name) pair that has more than one
            membership row, with keys ``contactId``, ``daoName`` and
            ``count``.
        """
        query = """
            SELECT dm."contactId", dm."daoName", COUNT(*) as count
            FROM "DaoMembership" dm
            GROUP BY dm."contactId", dm."daoName"
            HAVING COUNT(*) > 1
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} duplicate DAO memberships")

        return result

    def fix_dao_membership_issues(self) -> int:
        """Fix issues with DAO memberships.

        For every duplicated (contact, DAO name) pair the membership with
        the earliest ``joinedAt`` is kept (NULL dates sort last) and the
        remaining rows are deleted.

        Returns:
            Number of membership rows deleted.
        """
        # Get duplicate DAO memberships
        issues = self.check_dao_membership_issues()

        fixed_count = 0
        for issue in issues:
            contact_id = issue["contactId"]
            dao_name = issue["daoName"]
            count = issue["count"]

            logger.info(f"Fixing duplicate DAO membership for contact {contact_id} in {dao_name} (found {count})")

            # Get all memberships for this contact and DAO
            membership_query = """
                SELECT id, "joinedAt"
                FROM "DaoMembership"
                WHERE "contactId" = %(contact_id)s AND "daoName" = %(dao_name)s
                ORDER BY "joinedAt" ASC NULLS LAST
            """
            memberships = self.db.execute_query(membership_query, {
                "contact_id": contact_id,
                "dao_name": dao_name
            })

            # The rows may have changed since the check ran. With fewer than
            # two rows there is nothing to delete, and indexing an empty
            # result or sending an empty tuple to the IN clause (presumably
            # rendered as invalid "IN ()") would fail.
            if not memberships or len(memberships) < 2:
                logger.info(" No duplicates left for this membership, skipping")
                continue

            # Keep the one with the earliest join date
            keep_id = memberships[0]["id"]
            delete_ids = [m["id"] for m in memberships[1:]]

            logger.info(f" Keeping membership {keep_id}, deleting {len(delete_ids)} duplicates")

            # Delete duplicate memberships
            delete_query = """
                DELETE FROM "DaoMembership"
                WHERE id IN %(delete_ids)s
            """
            rows_deleted = self.db.execute_update(delete_query, {
                "delete_ids": tuple(delete_ids)
            })

            if rows_deleted > 0:
                logger.info(f" Deleted {rows_deleted} duplicate memberships")
                fixed_count += rows_deleted

        logger.info(f"Fixed {fixed_count} duplicate DAO memberships")
        return fixed_count

    def run(self):
        """Run all checks and fixes.

        Runs every check, then the fixers for case issues, generic names and
        duplicate memberships when the matching check found something.
        Exact duplicate addresses are only reported, never fixed here.

        Returns:
            Sum of the counts returned by the fixers that ran.
        """
        logger.info("Starting data check and fix")

        # Check for various issues
        duplicate_addresses = self.check_duplicate_addresses()
        # Informational only: the call logs the multi-DAO contacts
        self.check_multi_dao_contacts()
        generic_names = self.check_generic_names()
        case_issues = self.check_case_sensitivity_issues()
        dao_issues = self.check_dao_membership_issues()

        # Fix issues if found
        fixed_count = 0

        if case_issues:
            fixed_count += self.fix_case_sensitivity_issues()

        if duplicate_addresses:
            logger.warning("Found duplicate Ethereum addresses, but not fixing automatically")
            logger.warning("Please run the merge_duplicate_contacts.py script to fix these")

        if generic_names:
            fixed_count += self.fix_generic_names()

        if dao_issues:
            fixed_count += self.fix_dao_membership_issues()

        logger.info(f"Data check and fix completed. Fixed {fixed_count} issues.")

        # Final check for multi-DAO contacts (this is not an issue, just informational)
        self.check_multi_dao_contacts()

        return fixed_count


def main():
    """Main function.

    Returns:
        Process exit code: 0 on success, 1 if any exception escaped the run.
    """
    try:
        checker = DataCheckerFixer()
        fixed_count = checker.run()

        logger.info(f"Data check and fix completed successfully. Fixed {fixed_count} issues.")
        return 0

    except Exception as e:
        logger.exception(f"Error checking and fixing data: {e}")
        return 1


if __name__ == "__main__":
    sys.exit(main())