#!/usr/bin/env python3
"""
Pool Temperature and Humidity Data Analysis
==========================================
Processes two CSV files, averages readings by minute, and creates overlay plots
"""

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import numpy as np
import os

def load_and_process_data(file1, file2):
    """Load and process both CSV files"""
    
    # Read the CSV files with proper handling
    print(f"Loading {file1}...")
    df1 = pd.read_csv(file1, skipinitialspace=True)
    
    print(f"Loading {file2}...")
    df2 = pd.read_csv(file2, skipinitialspace=True)
    
    # Clean the data - remove any rows where DATE or TIME might be invalid
    df1 = df1[df1['DATE'].str.contains(r'^\d{4}-\d{2}-\d{2}$', na=False)]
    df2 = df2[df2['DATE'].str.contains(r'^\d{4}-\d{2}-\d{2}$', na=False)]
    
    # Convert numeric columns to float
    numeric_cols = ['ATH_T', 'ATH_H', 'BMP_T', 'BMP_P', 'BMP_SLP', 'ALT', 'AD']
    for col in numeric_cols:
        if col in df1.columns:
            df1[col] = pd.to_numeric(df1[col], errors='coerce')
        if col in df2.columns:
            df2[col] = pd.to_numeric(df2[col], errors='coerce')
    
    # Add source labels
    df1['Source'] = 'Pool2'
    df2['Source'] = 'Pool'
    
    # Combine DataFrames
    df = pd.concat([df1, df2], ignore_index=True)
    
    print(f"Combined data: {len(df)} total records")
    print(f"Pool data: {len(df2)} records")
    print(f"Pool2 data: {len(df1)} records")
    
    return df

def round_to_minute(df):
    """Round timestamps to nearest minute and average readings"""
    
    # Create datetime column with error handling
    try:
        df['DateTime'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'], errors='coerce')
    except:
        # Alternative parsing if format is mixed
        df['DateTime'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'], format='mixed', errors='coerce')
    
    # Remove rows where datetime parsing failed
    df = df.dropna(subset=['DateTime'])
    
    # Round to nearest minute
    df['MinuteRounded'] = df['DateTime'].dt.round('min')
    
    # Group by source and rounded minute, then average
    numeric_cols = ['ATH_T', 'ATH_H', 'BMP_T', 'BMP_P', 'BMP_SLP', 'ALT', 'AD']
    
    averaged_df = df.groupby(['Source', 'MinuteRounded']).agg({
        **{col: 'mean' for col in numeric_cols},
        'DATE': 'first',
        'TIME': 'first'
    }).reset_index()
    
    print(f"After averaging by minute: {len(averaged_df)} records")
    
    return averaged_df

def create_overlay_plots(df):
    """Create overlay plots for temperatures and humidity - only for matching minutes"""
    
    # Find common minutes between both sources
    pool_minutes = set(df[df['Source'] == 'Pool']['MinuteRounded'])
    pool2_minutes = set(df[df['Source'] == 'Pool2']['MinuteRounded'])
    common_minutes = pool_minutes.intersection(pool2_minutes)
    
    print(f"Common minutes found: {len(common_minutes)}")
    print(f"Pool minutes: {len(pool_minutes)}, Pool2 minutes: {len(pool2_minutes)}")
    
    if len(common_minutes) == 0:
        print("❌ No overlapping time periods found between datasets!")
        return None
    
    # Filter data to only common minutes
    df_common = df[df['MinuteRounded'].isin(common_minutes)].copy()
    
    # Pivot data for easier plotting
    temp_ath = df_common.pivot(index='MinuteRounded', columns='Source', values='ATH_T')
    temp_bmp = df_common.pivot(index='MinuteRounded', columns='Source', values='BMP_T')
    humidity = df_common.pivot(index='MinuteRounded', columns='Source', values='ATH_H')
    
    # Create figure with 2 subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))
    
    # Color schemes
    pool_colors = {'temp': '#1f77b4', 'humid': '#ff7f0e'}  # Blue, Orange
    pool2_colors = {'temp': '#2ca02c', 'humid': '#d62728'}  # Green, Red
    
    # Temperature Plot - only plot if both sources have data
    if 'Pool' in temp_ath.columns and 'Pool2' in temp_ath.columns:
        # Remove rows where either source is missing
        temp_ath_clean = temp_ath.dropna()
        temp_bmp_clean = temp_bmp.dropna()
        
        if not temp_ath_clean.empty:
            ax1.plot(temp_ath_clean.index, temp_ath_clean['Pool'], 
                    'o-', label='Pool ATH_T', color=pool_colors['temp'], alpha=0.8, markersize=4)
            ax1.plot(temp_ath_clean.index, temp_ath_clean['Pool2'], 
                    'o-', label='Pool2 ATH_T', color=pool2_colors['temp'], alpha=0.8, markersize=4)
        
        if not temp_bmp_clean.empty:
            ax1.plot(temp_bmp_clean.index, temp_bmp_clean['Pool'], 
                    's--', label='Pool BMP_T', color=pool_colors['temp'], alpha=0.6, markersize=3)
            ax1.plot(temp_bmp_clean.index, temp_bmp_clean['Pool2'], 
                    's--', label='Pool2 BMP_T', color=pool2_colors['temp'], alpha=0.6, markersize=3)
    
    ax1.set_title(f'Temperature Comparison: Synchronized Data ({len(common_minutes)} matching minutes)', 
                  fontsize=16, fontweight='bold')
    ax1.set_ylabel('Temperature (°C)', fontsize=12)
    ax1.grid(True, alpha=0.3)
    ax1.legend(loc='upper right')
    
    # Format x-axis for temperature plot
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d %H:%M'))
    ax1.xaxis.set_major_locator(mdates.HourLocator(interval=2))
    plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45)
    
    # Humidity Plot - only plot if both sources have data
    if 'Pool' in humidity.columns and 'Pool2' in humidity.columns:
        humidity_clean = humidity.dropna()
        
        if not humidity_clean.empty:
            ax2.plot(humidity_clean.index, humidity_clean['Pool'], 
                    'o-', label='Pool Humidity', color=pool_colors['humid'], alpha=0.8, markersize=4)
            ax2.plot(humidity_clean.index, humidity_clean['Pool2'], 
                    'o-', label='Pool2 Humidity', color=pool2_colors['humid'], alpha=0.8, markersize=4)
    
    ax2.set_title(f'Humidity Comparison: Synchronized Data ({len(common_minutes)} matching minutes)', 
                  fontsize=16, fontweight='bold')
    ax2.set_xlabel('Time', fontsize=12)
    ax2.set_ylabel('Humidity (%)', fontsize=12)
    ax2.grid(True, alpha=0.3)
    ax2.legend(loc='upper right')
    
    # Format x-axis for humidity plot
    ax2.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d %H:%M'))
    ax2.xaxis.set_major_locator(mdates.HourLocator(interval=2))
    plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45)
    
    plt.tight_layout()
    
    # Save plot
    output_file = 'pool_temp_humidity_synchronized.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    print(f"Synchronized plot saved as: {output_file}")
    
    return fig

def print_statistics(df):
    """Print summary statistics"""
    
    print("\\n=== SUMMARY STATISTICS ===")
    
    for source in df['Source'].unique():
        source_data = df[df['Source'] == source]
        print(f"\\n{source.upper()} DATA:")
        print(f"  Records: {len(source_data)}")
        print(f"  Time Range: {source_data['MinuteRounded'].min()} to {source_data['MinuteRounded'].max()}")
        print(f"  ATH Temperature: {source_data['ATH_T'].mean():.1f}°C ± {source_data['ATH_T'].std():.1f}")
        print(f"  BMP Temperature: {source_data['BMP_T'].mean():.1f}°C ± {source_data['BMP_T'].std():.1f}")  
        print(f"  Humidity: {source_data['ATH_H'].mean():.1f}% ± {source_data['ATH_H'].std():.1f}")
        print(f"  Pressure: {source_data['BMP_P'].mean():.1f} hPa ± {source_data['BMP_P'].std():.1f}")

def main():
    """Main processing function"""
    
    # File paths (relative to /var/www/html/tmp)
    file1 = "../pool2/temps_humid_pressure.csv"
    file2 = "../pool/temps_humid_pressure.csv"
    
    # Check if files exist
    for file in [file1, file2]:
        if not os.path.exists(file):
            print(f"Error: File not found: {file}")
            return
    
    try:
        # Load and process data
        df = load_and_process_data(file1, file2)
        
        # Round to minutes and average
        averaged_df = round_to_minute(df)
        
        # Create overlay plots
        create_overlay_plots(averaged_df)
        
        # Print statistics
        print_statistics(averaged_df)
        
        # Save processed data
        output_csv = 'processed_pool_data_by_minute.csv'
        averaged_df.to_csv(output_csv, index=False)
        print(f"\\nProcessed data saved as: {output_csv}")
        
        print(f"\\n✅ Analysis complete!")
        print(f"   - Plot: pool_temp_humidity_comparison.png")
        print(f"   - Data: {output_csv}")
        
    except Exception as e:
        print(f"Error processing data: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()