478 lines
20 KiB
Python
478 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Check and Fix Data Issues
|
|
|
|
This script checks for various data issues in the database and fixes them:
|
|
1. Duplicate contacts by Ethereum address
|
|
2. Contacts with multiple DAO memberships
|
|
3. Contacts with missing or generic names
|
|
4. Other data quality issues
|
|
|
|
Usage:
|
|
python check_and_fix_data.py
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import logging
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from dotenv import load_dotenv
|
|
|
|
# Put the parent of this script's directory on sys.path so the project-local
# ``utils`` package imported just below can be resolved.
_script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(_script_dir))

from utils.db_connector import DatabaseConnector
from utils.logger import setup_logger

# Load environment variables
load_dotenv()

# Module-wide logger shared by the checker class and main()
logger = setup_logger("data_checker_fixer")
|
|
|
|
class DataCheckerFixer:
    """Find and repair data-quality problems in the contact database.

    Each ``check_*`` method runs one read-only query and returns the offending
    rows. Each ``fix_*`` method re-runs its own check and then issues the
    corrective updates/deletes through ``self.db``.
    """

    def __init__(self):
        """Create the checker and its database connector."""
        self.db = DatabaseConnector()

    # ------------------------------------------------------------------
    # Checks
    # ------------------------------------------------------------------

    def check_duplicate_addresses(self) -> List[Dict[str, Any]]:
        """Return Ethereum addresses stored on more than one contact.

        Returns:
            One row per duplicated address with ``ethereumAddress``,
            ``count`` and ``contact_ids``, most duplicated first.
        """
        query = """
            SELECT "ethereumAddress", COUNT(*) as count, array_agg(id) as contact_ids
            FROM "Contact"
            WHERE "ethereumAddress" IS NOT NULL
            GROUP BY "ethereumAddress"
            HAVING COUNT(*) > 1
            ORDER BY COUNT(*) DESC
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} duplicate Ethereum addresses")
        return result

    def check_multi_dao_contacts(self) -> List[Dict[str, Any]]:
        """Return contacts that are members of more than one DAO.

        This is informational only; each matching contact is also logged.

        Returns:
            One row per contact with ``id``, ``ethereumAddress``, ``name``,
            ``dao_count`` and ``daos``.
        """
        # DISTINCT inside array_agg keeps the listed names consistent with
        # dao_count, which is itself a DISTINCT count; without it a duplicated
        # membership row makes the same DAO appear several times in the log.
        query = """
            SELECT
                c.id,
                c."ethereumAddress",
                c.name,
                COUNT(DISTINCT dm."daoName") as dao_count,
                array_agg(DISTINCT dm."daoName") as daos
            FROM "Contact" c
            JOIN "DaoMembership" dm ON c.id = dm."contactId"
            GROUP BY c.id, c."ethereumAddress", c.name
            HAVING COUNT(DISTINCT dm."daoName") > 1
            ORDER BY dao_count DESC
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} contacts that belong to multiple DAOs")

        # Print details for each multi-DAO contact
        for contact in result:
            logger.info(f"Contact {contact['name']} ({contact['ethereumAddress']}) belongs to {contact['dao_count']} DAOs: {', '.join(contact['daos'])}")

        return result

    def check_generic_names(self) -> List[Dict[str, Any]]:
        """Return contacts whose name is missing or a known placeholder.

        A name counts as generic when it is NULL, empty, exactly
        ``'Raid Guild Member'``, or starts with the literal prefix ``RG_``.

        Returns:
            Rows with ``id``, ``ethereumAddress``, ``name`` and ``ensName``.
        """
        # The backslash escape makes "_" a literal underscore instead of the
        # LIKE single-character wildcard.
        query = """
            SELECT id, "ethereumAddress", name, "ensName"
            FROM "Contact"
            WHERE name IS NULL
                OR name = ''
                OR name = 'Raid Guild Member'
                OR name LIKE 'RG\\_%%' ESCAPE '\\'
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} contacts with generic or missing names")
        return result

    def check_case_sensitivity_issues(self) -> List[Dict[str, Any]]:
        """Return groups of contacts whose addresses collide when lowercased.

        Returns:
            One row per lowercased address with ``lower_address``,
            ``addresses``, ``contact_ids`` and ``count``.
        """
        # Contacts without an address are excluded: LOWER(NULL) is NULL, so
        # they would otherwise be grouped together and treated as one contact.
        # NOTE(review): HAVING COUNT(*) > 1 also matches exact duplicates
        # (identical case), so this overlaps check_duplicate_addresses --
        # confirm that is intended.
        query = """
            SELECT LOWER("ethereumAddress") as lower_address,
                array_agg("ethereumAddress") as addresses,
                array_agg(id) as contact_ids,
                COUNT(*) as count
            FROM "Contact"
            WHERE "ethereumAddress" IS NOT NULL
            GROUP BY LOWER("ethereumAddress")
            HAVING COUNT(*) > 1
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} potential case sensitivity issues with Ethereum addresses")
        return result

    def check_dao_membership_issues(self) -> List[Dict[str, Any]]:
        """Return (contact, DAO) pairs that have more than one membership row.

        Returns:
            Rows with ``contactId``, ``daoName`` and ``count``.
        """
        query = """
            SELECT dm."contactId", dm."daoName", COUNT(*) as count
            FROM "DaoMembership" dm
            GROUP BY dm."contactId", dm."daoName"
            HAVING COUNT(*) > 1
        """

        result = self.db.execute_query(query)
        logger.info(f"Found {len(result)} duplicate DAO memberships")
        return result

    # ------------------------------------------------------------------
    # Fixes
    # ------------------------------------------------------------------

    def _derive_contact_name(self, contact: Dict[str, Any]) -> Optional[str]:
        """Build a replacement name for *contact*, or None if nothing usable exists."""
        ens_name = contact["ensName"]
        if ens_name:
            # Use ENS name without .eth suffix
            name = ens_name[:-4] if ens_name.endswith('.eth') else ens_name
            if name:
                return name
            # An ENS value of just ".eth" would produce an empty name, which
            # is still generic; fall through to the address-based name.

        eth_address = contact["ethereumAddress"]
        if not eth_address:
            # The check query does not require an address, so it can be NULL.
            return None

        dao_query = """
            SELECT "daoName" FROM "DaoMembership"
            WHERE "contactId" = %(contact_id)s
        """
        dao_rows = self.db.execute_query(dao_query, {"contact_id": contact["id"]})
        dao_names = [row["daoName"] for row in dao_rows or [] if row["daoName"]]

        # Prefix the shortened address with the first two letters of the
        # first DAO returned, or "ETH" when the contact has no membership.
        prefix = dao_names[0][:2] if dao_names else "ETH"
        return f"{prefix}_{eth_address[:6]}...{eth_address[-4:]}"

    def fix_generic_names(self) -> int:
        """Replace missing or placeholder contact names with derived ones.

        The new name is the ENS name without its ``.eth`` suffix when one is
        stored; otherwise a shortened Ethereum address prefixed with two
        letters of a DAO name (or ``ETH``). Contacts with neither an ENS name
        nor an address are skipped with a warning.

        Returns:
            Number of contacts whose name was updated.
        """
        update_query = """
            UPDATE "Contact"
            SET name = %(name)s,
                "updatedAt" = NOW()
            WHERE id = %(contact_id)s
        """

        fixed_count = 0
        for contact in self.check_generic_names():
            contact_id = contact["id"]
            name = self._derive_contact_name(contact)
            if name is None:
                logger.warning(f"Skipping contact {contact_id}: no ENS name or Ethereum address to build a name from")
                continue

            rows_updated = self.db.execute_update(update_query, {
                "contact_id": contact_id,
                "name": name,
            })

            if rows_updated > 0:
                logger.info(f"Updated name for contact {contact_id} to '{name}'")
                fixed_count += 1

        logger.info(f"Fixed {fixed_count} contacts with generic or missing names")
        return fixed_count

    def _merge_secondary_contacts(self, primary_id: Any, secondary_ids: List[Any]) -> int:
        """Move memberships and notes from each secondary contact to the primary, then delete it.

        Returns the number of secondary contacts deleted.
        """
        primary_daos_query = """
            SELECT "daoName" FROM "DaoMembership" WHERE "contactId" = %(contact_id)s
        """
        secondary_daos_query = """
            SELECT id, "daoName", "daoType", "joinedAt"
            FROM "DaoMembership"
            WHERE "contactId" = %(contact_id)s
        """
        transfer_membership_query = """
            UPDATE "DaoMembership"
            SET "contactId" = %(primary_id)s,
                "updatedAt" = NOW()
            WHERE id = %(membership_id)s
        """
        notes_query = """
            UPDATE "Note"
            SET "contactId" = %(primary_id)s,
                "updatedAt" = NOW()
            WHERE "contactId" = %(secondary_id)s
        """
        delete_query = """
            DELETE FROM "Contact"
            WHERE id = %(secondary_id)s
        """

        primary_rows = self.db.execute_query(primary_daos_query, {"contact_id": primary_id})
        # Tracked as a set and extended on every transfer so that a later
        # secondary contact cannot hand the primary the same DAO twice.
        primary_daos = {row["daoName"] for row in primary_rows or []}

        deleted = 0
        for secondary_id in secondary_ids:
            secondary_daos = self.db.execute_query(secondary_daos_query, {"contact_id": secondary_id})

            for dao in secondary_daos or []:
                dao_name = dao["daoName"]

                if dao_name in primary_daos:
                    # NOTE(review): the secondary's own row for this DAO is
                    # left in place; whether it disappears with the contact
                    # depends on the foreign-key rule, which is not visible
                    # here -- confirm.
                    logger.info(f" Primary already has membership in {dao_name}, skipping")
                    continue

                self.db.execute_update(transfer_membership_query, {
                    "primary_id": primary_id,
                    "membership_id": dao["id"],
                })
                primary_daos.add(dao_name)
                logger.info(f" Transferred {dao_name} membership from secondary to primary contact")

            # Transfer notes
            self.db.execute_update(notes_query, {
                "primary_id": primary_id,
                "secondary_id": secondary_id,
            })

            # Delete the secondary contact
            self.db.execute_update(delete_query, {"secondary_id": secondary_id})
            logger.info(f" Deleted secondary contact {secondary_id}")
            deleted += 1

        return deleted

    def _merge_case_variants(self, lower_address: str, contact_ids: List[Any]) -> int:
        """Collapse one group of case-variant contacts onto a single primary contact.

        Returns the number of contact rows changed (lowercased or deleted).
        """
        existing_query = """
            SELECT id FROM "Contact"
            WHERE "ethereumAddress" = %(lower_address)s
        """
        existing = self.db.execute_query(existing_query, {"lower_address": lower_address})

        changed = 0
        if existing:
            # A contact already stores the lowercase form: keep that one.
            primary_id = existing[0]["id"]
            secondary_ids = [cid for cid in contact_ids if cid != primary_id]

            logger.info(f" Found existing contact {primary_id} with lowercase address")
            logger.info(f" Will merge {len(secondary_ids)} other contacts into it")
        else:
            # Nobody has the lowercase form yet: promote the first contact of
            # the group and rewrite its address to lowercase.
            primary_id = contact_ids[0]
            secondary_ids = contact_ids[1:]

            logger.info(" No contact with lowercase address found")
            logger.info(f" Will update contact {primary_id} to lowercase and merge {len(secondary_ids)} others into it")

            lowercase_query = """
                UPDATE "Contact"
                SET "ethereumAddress" = %(lower_address)s,
                    "updatedAt" = NOW()
                WHERE id = %(primary_id)s
            """
            self.db.execute_update(lowercase_query, {
                "primary_id": primary_id,
                "lower_address": lower_address,
            })

            logger.info(f" Updated address for contact {primary_id} to lowercase")
            changed += 1

        changed += self._merge_secondary_contacts(primary_id, secondary_ids)
        return changed

    def fix_case_sensitivity_issues(self) -> int:
        """Merge contacts whose Ethereum addresses differ only by letter case.

        For every group reported by ``check_case_sensitivity_issues`` one
        contact is kept as primary: the one already stored with the lowercase
        address if there is one, otherwise the first id of the group, whose
        address is rewritten to lowercase. DAO memberships the primary lacks
        and all notes are moved over from the other contacts, which are then
        deleted.

        Each group is wrapped in ``BEGIN``/``COMMIT`` statements sent through
        the connector. On any error a ``ROLLBACK`` is sent, the error is
        logged and processing continues with the next group.

        Returns:
            Number of contact rows changed (addresses lowercased plus
            contacts deleted), counting only groups that committed.
        """
        issues = self.check_case_sensitivity_issues()

        fixed_count = 0
        for issue in issues:
            lower_address = issue["lower_address"]
            addresses = issue["addresses"]
            contact_ids = issue["contact_ids"]

            logger.info(f"Fixing case sensitivity issue for address {lower_address}")
            logger.info(f" Found variations: {', '.join(addresses)}")

            try:
                self.db.execute_query("BEGIN")
                changed = self._merge_case_variants(lower_address, contact_ids)
                self.db.execute_query("COMMIT")
            except Exception as e:
                # Rollback on error
                self.db.execute_query("ROLLBACK")
                logger.error(f"Error fixing case sensitivity issue for {lower_address}: {e}")
                continue
            else:
                # Counted only after COMMIT so rolled-back work is never
                # reported as fixed.
                fixed_count += changed
                logger.info(f" Successfully fixed case sensitivity issue for {lower_address}")

        logger.info(f"Fixed {fixed_count} contacts with case sensitivity issues")
        return fixed_count

    def fix_dao_membership_issues(self) -> int:
        """Delete duplicate membership rows for the same contact and DAO.

        For each duplicated (contact, DAO) pair the row with the earliest
        ``joinedAt`` is kept (rows without a join date sort last, ties are
        broken by ``id``) and the remaining rows are deleted.

        Returns:
            Number of membership rows deleted.
        """
        membership_query = """
            SELECT id, "joinedAt"
            FROM "DaoMembership"
            WHERE "contactId" = %(contact_id)s AND "daoName" = %(dao_name)s
            ORDER BY "joinedAt" ASC NULLS LAST, id ASC
        """
        delete_query = """
            DELETE FROM "DaoMembership"
            WHERE id IN %(delete_ids)s
        """

        fixed_count = 0
        for issue in self.check_dao_membership_issues():
            contact_id = issue["contactId"]
            dao_name = issue["daoName"]
            count = issue["count"]

            logger.info(f"Fixing duplicate DAO membership for contact {contact_id} in {dao_name} (found {count})")

            memberships = self.db.execute_query(membership_query, {
                "contact_id": contact_id,
                "dao_name": dao_name,
            }) or []

            # Rows may have changed since the check ran. With fewer than two
            # there is nothing to delete, and an empty tuple would render as
            # the invalid SQL "IN ()".
            if len(memberships) < 2:
                logger.info(f" No duplicates left for contact {contact_id} in {dao_name}, skipping")
                continue

            # Keep the one with the earliest join date
            keep_id = memberships[0]["id"]
            delete_ids = tuple(m["id"] for m in memberships[1:])

            logger.info(f" Keeping membership {keep_id}, deleting {len(delete_ids)} duplicates")

            rows_deleted = self.db.execute_update(delete_query, {"delete_ids": delete_ids})

            if rows_deleted > 0:
                logger.info(f" Deleted {rows_deleted} duplicate memberships")
                fixed_count += rows_deleted

        logger.info(f"Fixed {fixed_count} duplicate DAO memberships")
        return fixed_count

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def run(self) -> int:
        """Run every check, apply the automatic fixes and return the fix total.

        Duplicate Ethereum addresses are only reported, never fixed here.

        Returns:
            Sum of the counts returned by the ``fix_*`` methods that ran.
        """
        logger.info("Starting data check and fix")

        # Check for various issues
        duplicate_addresses = self.check_duplicate_addresses()
        self.check_multi_dao_contacts()  # informational: logs each contact
        generic_names = self.check_generic_names()
        case_issues = self.check_case_sensitivity_issues()
        dao_issues = self.check_dao_membership_issues()

        # Fix issues if found
        fixed_count = 0

        if case_issues:
            fixed_count += self.fix_case_sensitivity_issues()
            # The case fix merges contacts, so the earlier duplicate list may
            # be stale; refresh it so the warning below reflects what is left.
            duplicate_addresses = self.check_duplicate_addresses()

        if duplicate_addresses:
            logger.warning("Found duplicate Ethereum addresses, but not fixing automatically")
            logger.warning("Please run the merge_duplicate_contacts.py script to fix these")

        if generic_names:
            fixed_count += self.fix_generic_names()

        if dao_issues:
            fixed_count += self.fix_dao_membership_issues()

        logger.info(f"Data check and fix completed. Fixed {fixed_count} issues.")

        # Final check for multi-DAO contacts (this is not an issue, just informational)
        self.check_multi_dao_contacts()

        return fixed_count
|
|
|
|
def main():
    """Run the checker/fixer once and translate the outcome into an exit code.

    Returns:
        0 when the run finished without raising, 1 when any exception was
        caught (the exception is logged with its traceback).
    """
    exit_code = 1  # assume failure until the run has completed
    try:
        total_fixed = DataCheckerFixer().run()
        logger.info(f"Data check and fix completed successfully. Fixed {total_fixed} issues.")
        exit_code = 0
    except Exception as err:
        logger.exception(f"Error checking and fixing data: {err}")
    return exit_code
|
|
|
|
if __name__ == "__main__":
    # Hand main()'s return value (0 on success, 1 on failure) to the shell
    # as the process exit status.
    sys.exit(main())