# ------------------------------------------------------------------------------
#  NetFlow CSV Loader
#
#  © 2025 Vojtěch Drahý
#  All rights reserved.
#
#  This software is provided "as is", without warranty of any kind,
#  express or implied, including but not limited to the warranties of
#  merchantability, fitness for a particular purpose and noninfringement.
#  In no event shall the authors or copyright holders be liable for any claim,
#  damages or other liability, whether in an action of contract, tort or otherwise,
#  arising from, out of or in connection with the software or the use or other
#  dealings in the software.
# ------------------------------------------------------------------------------

import csv
from dataclasses import dataclass
from typing import List, Set, Optional, Tuple


@dataclass(frozen=True)
class FlowRecord:
    """
    Immutable description of one NetFlow entry.

    Instances are frozen dataclasses, so they are hashable and can be stored
    in sets or used as dictionary keys. Every field, including ``entry_id``,
    takes part in equality comparison and hashing.

    Attributes:
        entry_id (int): Identifier assigned to the record.
        src_ip (str): Address of the sending endpoint.
        dst_ip (str): Address of the receiving endpoint.
        protocol (int): IP protocol number (for example 6 for TCP, 17 for UDP).
        src_port (int): Port used by the sending endpoint.
        dst_port (int): Port used by the receiving endpoint.
    """
    # Record identifier.
    entry_id: int
    # Endpoint addresses.
    src_ip: str
    dst_ip: str
    # IP protocol number.
    protocol: int
    # Endpoint ports.
    src_port: int
    dst_port: int


def load_netflow_csv(
    files: List[str],
    protocol_filter: Optional[List[int]] = None
) -> Tuple[Set[FlowRecord], int, int]:
    """
    Load NetFlow CSV files and extract unique communication flow records.

    This function reads one or more NetFlow CSV files, filters by protocol (if specified),
    and builds a set of flow records that are unique by
    (source IP, destination IP, protocol, source port, destination port).
    The first line of every file is treated as a header and skipped; completely
    empty rows (e.g. a trailing blank line) are ignored.

    Args:
        files (List[str]): List of file paths pointing to NetFlow CSV files.
        protocol_filter (Optional[List[int]]): If provided, only records matching
            one of these protocol numbers will be analyzed (e.g., [6, 17]).
            If None, all protocols are included.

    Returns:
        Tuple[Set[FlowRecord], int, int]:
            - Set of unique FlowRecord objects. Each record's ``entry_id`` is the
              running entry count at the first occurrence of that flow.
            - Total number of entries processed (after filtering), duplicates included.
            - Total number of packets across all processed entries.

    Raises:
        OSError: If a file cannot be opened.
        IndexError: If a non-empty row has fewer columns than expected.
        ValueError: If a protocol, port or packet-count field is not an integer.
    """
    # Zero-based column positions of the fields used from each CSV row.
    col_dpkts = 4
    col_srcaddr = 10
    col_dstaddr = 11
    col_srcport = 15
    col_dstport = 16
    col_prot = 17

    # Convert the filter once so the per-row membership test is a hash lookup.
    allowed = None if protocol_filter is None else frozenset(protocol_filter)

    flows: Set[FlowRecord] = set()
    # FlowRecord equality includes entry_id, which differs for every row, so
    # the set alone would never collapse duplicates; track the flow key here.
    seen_keys: Set[Tuple[str, str, int, int, int]] = set()
    total_entries = 0
    total_packets = 0

    for file_path in files:
        with open(file_path, "r", newline="") as csvfile:
            reader = csv.reader(csvfile)
            next(reader, None)  # Skip header

            for row in reader:
                # csv.reader yields an empty list for blank lines.
                if not row:
                    continue

                protocol = int(row[col_prot])

                # Apply optional protocol filtering
                if allowed is not None and protocol not in allowed:
                    continue

                total_entries += 1
                total_packets += int(row[col_dpkts])

                src_ip = row[col_srcaddr]
                dst_ip = row[col_dstaddr]
                src_port = int(row[col_srcport])
                dst_port = int(row[col_dstport])

                key = (src_ip, dst_ip, protocol, src_port, dst_port)
                if key in seen_keys:
                    # Already recorded; it still counts toward the totals above.
                    continue
                seen_keys.add(key)

                flows.add(
                    FlowRecord(
                        entry_id=total_entries,
                        src_ip=src_ip,
                        dst_ip=dst_ip,
                        protocol=protocol,
                        src_port=src_port,
                        dst_port=dst_port,
                    )
                )

    return flows, total_entries, total_packets



if __name__ == "__main__":
    import glob
    from itertools import islice

    # glob.glob returns paths in arbitrary order; sorting keeps the processing
    # order (and therefore the assigned entry IDs) reproducible between runs.
    files = sorted(glob.glob("exported-flows/*.csv"))

    # Example: load only TCP and UDP flows
    flows, entries, packets = load_netflow_csv(files, protocol_filter=[6, 17])

    print(f"Processed entries: {entries}")
    print(f"Total packets: {packets}")
    print(f"Unique flows: {len(flows)}")

    # Print a sample of flows without copying the whole set into a list
    for flow in islice(flows, 5):
        print(flow)
