#!/usr/bin/env python3
"""
CloudCraze Data Extraction Script for Salesforce B2B Commerce Migration
Extracts CCRZ objects with relationship validation and data quality reporting
Author: Senior Developer, NULogic
Version: 1.0
"""

import os
import csv
import json
import logging
import hashlib
from datetime import datetime
from typing import Dict, List, Tuple, Any
from simple_salesforce import Salesforce
import yaml

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('migration_extraction.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class CCRZDataExtractor:
    """Extract CloudCraze data from Salesforce org"""

    def __init__(self, config_path: str = 'migration-config.yaml'):
        """Initialize extractor with configuration"""
        self.config = self._load_config(config_path)
        self.sf = None
        self.extracted_data = {}
        self.data_quality_report = {}
        self.checksums = {}
        self.extraction_start_time = None
        self.record_counts = {}

    def _load_config(self, config_path: str) -> Dict[str, Any]:
        """Load YAML configuration file"""
        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
            logger.info(f'Configuration loaded from {config_path}')
            return config
        except FileNotFoundError:
            logger.error(f'Configuration file not found: {config_path}')
            raise

    def connect_to_org(self, client_id: str, client_secret: str,
                       username: str, password: str) -> bool:
        """Establish OAuth connection to source CCRZ org"""
        try:
            self.sf = Salesforce(
                client_id=client_id,
                client_secret=client_secret,
                username=username,
                password=password,
                sandbox=self.config.get('source_sandbox', False)
            )
            logger.info(f'Successfully connected to org: {username}')
            return True
        except Exception as e:
            logger.error(f'Failed to connect to org: {str(e)}')
            return False

    def extract_all_data(self) -> bool:
        """Execute full extraction process"""
        self.extraction_start_time = datetime.now()
        logger.info('Starting CloudCraze data extraction')

        try:
            # Extract products
            self._extract_products()

            # Extract pricing
            self._extract_pricelists()
            self._extract_pricelist_items()

            # Extract orders
            self._extract_orders()
            self._extract_order_items()
            self._extract_order_payments()

            # Extract supplementary data
            self._extract_categories()
            self._extract_account_groups()

            # Validate relationships
            self._validate_relationships()

            # Generate quality report
            self._generate_quality_report()

            # Export to CSV
            self._export_to_csv()

            logger.info('Data extraction completed successfully')
            return True

        except Exception as e:
            logger.error(f'Extraction failed: {str(e)}')
            return False

    def _extract_products(self):
        """Extract E_Product__c records"""
        logger.info('Extracting products...')

        query = """
            SELECT Id, ccrz__ProductCode__c, ccrz__ProductName__c,
                   ccrz__ProductDescription__c, ccrz__ProductType__c,
                   ccrz__ProductFamily__c, ccrz__Manufacturer__c,
                   ccrz__Weight__c, ccrz__IsActive__c, CreatedDate, LastModifiedDate
            FROM ccrz__E_Product__c
            ORDER BY ccrz__ProductCode__c
        """

        try:
            results = self.sf.query_all(query)
            records = results.get('records', [])

            self.extracted_data['products'] = records
            self.record_counts['products'] = len(records)

            # Calculate checksum
            self.checksums['products'] = self._calculate_checksum(records)

            logger.info(f'Extracted {len(records)} products')

            # Quality checks
            self._check_product_quality(records)

        except Exception as e:
            logger.error(f'Failed to extract products: {str(e)}')
            raise

    def _extract_pricelists(self):
        """Extract E_PriceList__c records"""
        logger.info('Extracting pricelists...')

        query = """
            SELECT Id, ccrz__PriceListCode__c, ccrz__PriceListName__c,
                   ccrz__CurrencyCode__c, ccrz__IsActive__c,
                   ccrz__EffectiveDate__c, ccrz__ExpirationDate__c,
                   CreatedDate, LastModifiedDate
            FROM ccrz__E_PriceList__c
            ORDER BY ccrz__PriceListCode__c
        """

        try:
            results = self.sf.query_all(query)
            records = results.get('records', [])

            self.extracted_data['pricelists'] = records
            self.record_counts['pricelists'] = len(records)
            self.checksums['pricelists'] = self._calculate_checksum(records)

            logger.info(f'Extracted {len(records)} pricelists')

        except Exception as e:
            logger.error(f'Failed to extract pricelists: {str(e)}')
            raise

    def _extract_pricelist_items(self):
        """Extract E_PriceListItem__c records"""
        logger.info('Extracting pricelist items...')

        query = """
            SELECT Id, ccrz__PriceList__c, ccrz__Product__c,
                   ccrz__ListPrice__c, ccrz__UnitPrice__c,
                   ccrz__DiscountPercent__c, ccrz__VolumeTierStartQty__c,
                   ccrz__VolumeTierEndQty__c, ccrz__IsActive__c,
                   CreatedDate, LastModifiedDate
            FROM ccrz__E_PriceListItem__c
            ORDER BY ccrz__PriceList__c
        """

        try:
            results = self.sf.query_all(query)
            records = results.get('records', [])

            self.extracted_data['pricelist_items'] = records
            self.record_counts['pricelist_items'] = len(records)
            self.checksums['pricelist_items'] = self._calculate_checksum(records)

            logger.info(f'Extracted {len(records)} pricelist items')

            # Quality checks
            self._check_pricing_quality(records)

        except Exception as e:
            logger.error(f'Failed to extract pricelist items: {str(e)}')
            raise

    def _extract_orders(self):
        """Extract E_Order__c records"""
        logger.info('Extracting orders...')

        query = """
            SELECT Id, ccrz__OrderNumber__c, ccrz__Account__c,
                   ccrz__OrderDate__c, ccrz__OrderStatus__c,
                   ccrz__OrderTotal__c, ccrz__OrderSubtotal__c,
                   ccrz__OrderTax__c, ccrz__OrderShipping__c,
                   ccrz__ShippingMethod__c, ccrz__PONumber__c,
                   CreatedDate, LastModifiedDate
            FROM ccrz__E_Order__c
            ORDER BY ccrz__OrderDate__c DESC
        """

        try:
            results = self.sf.query_all(query)
            records = results.get('records', [])

            self.extracted_data['orders'] = records
            self.record_counts['orders'] = len(records)
            self.checksums['orders'] = self._calculate_checksum(records)

            logger.info(f'Extracted {len(records)} orders')

            # Quality checks
            self._check_order_quality(records)

        except Exception as e:
            logger.error(f'Failed to extract orders: {str(e)}')
            raise

    def _extract_order_items(self):
        """Extract E_OrderItem__c records"""
        logger.info('Extracting order items...')

        query = """
            SELECT Id, ccrz__Order__c, ccrz__Product__c,
                   ccrz__Quantity__c, ccrz__UnitPrice__c,
                   ccrz__LineTotal__c, ccrz__TaxAmount__c,
                   CreatedDate, LastModifiedDate
            FROM ccrz__E_OrderItem__c
            ORDER BY ccrz__Order__c
        """

        try:
            results = self.sf.query_all(query)
            records = results.get('records', [])

            self.extracted_data['order_items'] = records
            self.record_counts['order_items'] = len(records)
            self.checksums['order_items'] = self._calculate_checksum(records)

            logger.info(f'Extracted {len(records)} order items')

        except Exception as e:
            logger.error(f'Failed to extract order items: {str(e)}')
            raise

    def _extract_order_payments(self):
        """Extract E_TransactionPayment__c records"""
        logger.info('Extracting order payments...')

        query = """
            SELECT Id, ccrz__Order__c, ccrz__PaymentMethod__c,
                   ccrz__TransactionAmount__c, ccrz__TransactionDate__c,
                   ccrz__PaymentStatus__c, ccrz__AuthorizationCode__c,
                   CreatedDate, LastModifiedDate
            FROM ccrz__E_TransactionPayment__c
            ORDER BY ccrz__Order__c
        """

        try:
            results = self.sf.query_all(query)
            records = results.get('records', [])

            self.extracted_data['payments'] = records
            self.record_counts['payments'] = len(records)
            self.checksums['payments'] = self._calculate_checksum(records)

            logger.info(f'Extracted {len(records)} payment records')

        except Exception as e:
            logger.error(f'Failed to extract payments: {str(e)}')
            raise

    def _extract_categories(self):
        """Extract product category data"""
        logger.info('Extracting categories...')

        query = """
            SELECT Id, ccrz__CategoryName__c, ccrz__ParentCategory__c
            FROM ccrz__E_ProductCategory__c
        """

        try:
            results = self.sf.query_all(query)
            records = results.get('records', [])

            self.extracted_data['categories'] = records
            self.record_counts['categories'] = len(records)

            logger.info(f'Extracted {len(records)} categories')

        except Exception as e:
            logger.error(f'Failed to extract categories: {str(e)}')

    def _extract_account_groups(self):
        """Extract E_AccountGroup__c for buyer group mappings"""
        logger.info('Extracting account groups...')

        query = """
            SELECT Id, ccrz__GroupName__c, ccrz__Account__c
            FROM ccrz__E_AccountGroup__c
        """

        try:
            results = self.sf.query_all(query)
            records = results.get('records', [])

            self.extracted_data['account_groups'] = records
            self.record_counts['account_groups'] = len(records)

            logger.info(f'Extracted {len(records)} account groups')

        except Exception as e:
            logger.error(f'Failed to extract account groups: {str(e)}')

    def _validate_relationships(self):
        """Validate referential integrity"""
        logger.info('Validating data relationships...')

        # Check for orphan order items
        if 'order_items' in self.extracted_data and 'orders' in self.extracted_data:
            order_ids = set(o['Id'] for o in self.extracted_data['orders'])
            orphan_items = [
                item for item in self.extracted_data['order_items']
                if item.get('ccrz__Order__c') not in order_ids
            ]
            if orphan_items:
                logger.warning(f'Found {len(orphan_items)} orphan order items')
                self.data_quality_report['orphan_order_items'] = len(orphan_items)

        # Check for orphan pricelist items
        if 'pricelist_items' in self.extracted_data and 'pricelists' in self.extracted_data:
            pricelist_ids = set(p['Id'] for p in self.extracted_data['pricelists'])
            orphan_entries = [
                item for item in self.extracted_data['pricelist_items']
                if item.get('ccrz__PriceList__c') not in pricelist_ids
            ]
            if orphan_entries:
                logger.warning(f'Found {len(orphan_entries)} orphan pricelist items')
                self.data_quality_report['orphan_pricelist_items'] = len(orphan_entries)

        logger.info('Relationship validation complete')

    def _check_product_quality(self, records: List[Dict]):
        """Check product data quality"""
        quality_issues = {
            'missing_product_code': 0,
            'missing_product_name': 0,
            'inactive_products': 0
        }

        for record in records:
            if not record.get('ccrz__ProductCode__c'):
                quality_issues['missing_product_code'] += 1
            if not record.get('ccrz__ProductName__c'):
                quality_issues['missing_product_name'] += 1
            if not record.get('ccrz__IsActive__c'):
                quality_issues['inactive_products'] += 1

        self.data_quality_report['products'] = quality_issues
        logger.info(f'Product quality report: {quality_issues}')

    def _check_pricing_quality(self, records: List[Dict]):
        """Check pricing data quality"""
        quality_issues = {
            'missing_unit_price': 0,
            'negative_prices': 0,
            'null_discount': 0
        }

        for record in records:
            if not record.get('ccrz__UnitPrice__c'):
                quality_issues['missing_unit_price'] += 1
            if record.get('ccrz__UnitPrice__c') and record['ccrz__UnitPrice__c'] < 0:
                quality_issues['negative_prices'] += 1
            if record.get('ccrz__DiscountPercent__c') is None:
                quality_issues['null_discount'] += 1

        self.data_quality_report['pricing'] = quality_issues
        logger.info(f'Pricing quality report: {quality_issues}')

    def _check_order_quality(self, records: List[Dict]):
        """Check order data quality"""
        quality_issues = {
            'missing_order_number': 0,
            'missing_order_date': 0,
            'negative_totals': 0
        }

        for record in records:
            if not record.get('ccrz__OrderNumber__c'):
                quality_issues['missing_order_number'] += 1
            if not record.get('ccrz__OrderDate__c'):
                quality_issues['missing_order_date'] += 1
            if record.get('ccrz__OrderTotal__c') and record['ccrz__OrderTotal__c'] < 0:
                quality_issues['negative_totals'] += 1

        self.data_quality_report['orders'] = quality_issues
        logger.info(f'Order quality report: {quality_issues}')

    def _calculate_checksum(self, records: List[Dict]) -> str:
        """Calculate SHA256 checksum of records"""
        data_string = json.dumps(records, sort_keys=True, default=str)
        return hashlib.sha256(data_string.encode()).hexdigest()

    def _generate_quality_report(self):
        """Generate comprehensive quality report"""
        report = {
            'extraction_timestamp': datetime.now().isoformat(),
            'record_counts': self.record_counts,
            'data_quality': self.data_quality_report,
            'checksums': self.checksums
        }

        report_path = f'migration_quality_report_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2, default=str)

        logger.info(f'Quality report generated: {report_path}')

    def _export_to_csv(self):
        """Export extracted data to CSV files"""
        export_dir = 'ccrz_export'
        os.makedirs(export_dir, exist_ok=True)

        for object_type, records in self.extracted_data.items():
            if not records:
                continue

            filepath = os.path.join(export_dir, f'{object_type}.csv')

            try:
                with open(filepath, 'w', newline='') as csvfile:
                    fieldnames = set()
                    for record in records:
                        fieldnames.update(record.keys())

                    writer = csv.DictWriter(csvfile, fieldnames=sorted(fieldnames))
                    writer.writeheader()
                    writer.writerows(records)

                logger.info(f'Exported {len(records)} {object_type} records to {filepath}')

            except Exception as e:
                logger.error(f'Failed to export {object_type}: {str(e)}')

    def get_extraction_summary(self) -> Dict[str, Any]:
        """Get summary of extraction"""
        elapsed = (datetime.now() - self.extraction_start_time).total_seconds()

        return {
            'duration_seconds': elapsed,
            'record_counts': self.record_counts,
            'total_records': sum(self.record_counts.values()),
            'data_quality_issues': self.data_quality_report,
            'extraction_status': 'success'
        }


def main():
    """Main execution"""
    extractor = CCRZDataExtractor('migration-config.yaml')

    # In production, credentials would come from secure environment variables
    # or AWS Secrets Manager
    if not extractor.connect_to_org(
        client_id=os.getenv('SALESFORCE_CLIENT_ID'),
        client_secret=os.getenv('SALESFORCE_CLIENT_SECRET'),
        username=os.getenv('SALESFORCE_USERNAME'),
        password=os.getenv('SALESFORCE_PASSWORD')
    ):
        logger.error('Failed to connect to Salesforce')
        return

    if extractor.extract_all_data():
        summary = extractor.get_extraction_summary()
        logger.info(f'Extraction summary: {json.dumps(summary, indent=2, default=str)}')
    else:
        logger.error('Extraction failed')


if __name__ == '__main__':
    main()
