Recod.ai/LUC - Scientific Image Forgery Detection EDA

This notebook performs exploratory data analysis with:

- Auto path detection on Kaggle
- Safe mask loading (.png/.npy, multi-masks → union)
- Strong guards for empty sets / all-zero coverage
- English-only plot titles/labels to avoid font issues

0) Imports & Configuration

import os, re, math, random
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams["axes.grid"] = True

try:
    import seaborn as sns
    sns.set_context("talk")
except Exception:
    sns = None

from PIL import Image

try:
    from skimage.measure import label as sk_label
except Exception:
    sk_label = None

try:
    import cv2
except Exception:
    cv2 = None

# One seed drives both the stdlib and the NumPy global RNGs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Sample-size cap intended for heavy operations.
SLOW_FEATURE_SAMPLE_MAX = 800

1) Locate Competition Paths

def find_competition_root():
    """Return the directory that holds the competition data.

    Searches ``/kaggle/input`` for a sub-directory whose name contains the
    competition slug. When ``/kaggle/input`` is absent or has no match, a
    same-named directory relative to the current working directory is
    returned instead (its existence is not checked here).

    Returns:
        Path: the matched Kaggle input directory or the local fallback.
    """
    slug = "recodai-luc-scientific-image-forgery-detection"
    base = Path("/kaggle/input")
    if base.exists():
        # iterdir() yields entries in arbitrary order; sorting makes the
        # choice reproducible when more than one directory matches.
        cands = sorted(p for p in base.iterdir() if p.is_dir() and slug in p.name)
        if cands:
            return cands[0]
    return Path(f"./{slug}")

# Resolve the dataset root once; every split directory is derived from it.
COMP_ROOT = find_competition_root()
print(f"[PATH] COMP_ROOT = {COMP_ROOT}")

DIR_TRAIN_IMG = COMP_ROOT.joinpath("train_images")
DIR_TRAIN_MASK = COMP_ROOT.joinpath("train_masks")
DIR_SUP_IMG = COMP_ROOT.joinpath("supplemental_images")
DIR_SUP_MASK = COMP_ROOT.joinpath("supplemental_masks")
DIR_TEST_IMG = COMP_ROOT.joinpath("test_images")

# Report which of the expected directories are actually present.
for p in (DIR_TRAIN_IMG, DIR_TRAIN_MASK, DIR_SUP_IMG, DIR_SUP_MASK, DIR_TEST_IMG):
    print(f"  - {p}: {'OK' if p.exists() else 'Missing'}")

2) IO Utilities (Robust)

# Lower-case file suffixes treated as images.
IMG_EXTS = {
    ".bmp",
    ".jpeg",
    ".jpg",
    ".png",
    ".tif",
    ".tiff",
}
# Lower-case file suffixes treated as masks.
MASK_EXTS = {
    ".npy",
    ".png",
}

def get_case_id_from_path(path: Path):
    """Derive a case identifier from a file path.

    Args:
        path: file whose stem (name without the final suffix) carries the id.

    Returns:
        The first run of digits in the stem as an ``int``; when the stem has
        no digits, the stem itself as a ``str``.
    """
    # The pattern must be r"\d+": with a doubled backslash the raw string
    # matches a literal backslash followed by 'd' characters, never digits.
    m = re.search(r"\d+", path.stem)
    return int(m.group()) if m else path.stem

def read_image(path: Path):
    """Load an image file as an RGB array.

    Args:
        path: image file to read.

    Returns:
        np.ndarray: the image converted to RGB, so overlays always get
        three channels.
    """
    # convert() returns a new image, so the source file can be released as
    # soon as the with-block exits instead of being left open.
    with Image.open(path) as img:
        rgb = img.convert("RGB")
    return np.array(rgb)  # HWC, uint8

def read_mask_file(path: Path):
    """Load a single mask file and binarize it.

    ``.npy`` files are read with ``np.load``; anything else is read as an
    image (OpenCV when available, Pillow otherwise). Non-zero pixels become 1.

    Args:
        path: mask file (.png or .npy).

    Returns:
        np.ndarray: uint8 array with values in {0, 1}. For 2-D inputs and for
        3-D ``.npy`` stacks the result is ``(H, W)``; a 3-D stack is collapsed
        to the union of its layers.
    """
    if path.suffix.lower() == ".npy":
        m = np.load(path)
        # Drop a singleton leading or trailing axis.
        if m.ndim == 3 and m.shape[0] == 1:
            m = m[0]
        if m.ndim == 3 and m.shape[-1] == 1:
            m = m[..., 0]
        binary = m > 0
        if binary.ndim == 3:
            # Several masks stored in one array: merge them so callers get
            # the 2-D mask they expect.
            # NOTE(review): the stack axis is guessed as the shorter of the
            # first/last axes — confirm against the dataset's .npy layout.
            stack_axis = 0 if binary.shape[0] <= binary.shape[-1] else -1
            binary = binary.any(axis=stack_axis)
        return binary.astype(np.uint8)
    else:
        if cv2 is not None:
            m = cv2.imread(str(path), cv2.IMREAD_UNCHANGED)
            if m is None:
                # OpenCV could not decode the file; fall back to Pillow.
                m = np.array(Image.open(path).convert("L"))
            elif m.ndim == 3:
                m = cv2.cvtColor(m, cv2.COLOR_BGR2GRAY)
        else:
            m = np.array(Image.open(path).convert("L"))
        return (m > 0).astype(np.uint8)

def list_mask_files_for_case(case_id, mask_dir: Path):
    """List the mask files in ``mask_dir`` that belong to ``case_id``.

    A file qualifies when its suffix (case-insensitive) is in ``MASK_EXTS``
    and its name is ``<case_id>.<ext>``, starts with ``<case_id>_``, or its
    stem contains ``case_id`` as a whole word.

    Args:
        case_id: identifier to look for (int or str).
        mask_dir: directory to search; a missing directory yields ``[]``.

    Returns:
        list[Path]: matching files, de-duplicated and sorted.
    """
    if not mask_dir.exists():
        return []
    # Whole-word match on the id. The id is escaped so regex metacharacters
    # in a string id are taken literally, and the pattern uses single
    # backslashes: r"\\b" would match a literal backslash followed by 'b'.
    id_pattern = re.compile(rf"\b{re.escape(str(case_id))}\b")
    exact = mask_dir.glob(f"{case_id}.*")
    suffixed = mask_dir.glob(f"{case_id}_*.*")
    whole_word = (p for p in mask_dir.glob("*.*") if id_pattern.search(p.stem))
    # Key by string path so a file found by several patterns appears once.
    found = {}
    for group in (exact, suffixed, whole_word):
        for p in group:
            if p.suffix.lower() in MASK_EXTS:
                found[str(p)] = p
    return sorted(found.values())

def read_mask_union_for_case(case_id, target_hw=None):
    """Merge every mask found for ``case_id`` into one binary mask.

    Mask files are looked up in both ``DIR_TRAIN_MASK`` and ``DIR_SUP_MASK``.
    When ``target_hw`` is given, each mask whose first two dimensions differ
    from it is resized with nearest-neighbour interpolation before merging.

    Args:
        case_id: identifier passed to ``list_mask_files_for_case``.
        target_hw: optional ``(height, width)`` to resize masks to.

    Returns:
        tuple: ``(union, n_files)`` where ``union`` is a uint8 {0, 1} array,
        or ``(None, 0)`` when no mask file exists.
    """
    # NOTE(review): both mask directories are searched for every case, so the
    # same id in the train and supplemental sets would be merged — confirm
    # that ids are unique across the two.
    mask_paths = [
        found
        for mask_dir in (DIR_TRAIN_MASK, DIR_SUP_MASK)
        for found in list_mask_files_for_case(case_id, mask_dir)
    ]
    if not mask_paths:
        return None, 0

    def _fit(layer):
        # Resize only when a target size is given and the mask differs from it.
        if target_hw is None:
            return layer
        if (layer.shape[0], layer.shape[1]) == tuple(target_hw):
            return layer
        dsize = (target_hw[1], target_hw[0])  # both libraries take (width, height)
        if cv2 is not None:
            return cv2.resize(layer, dsize, interpolation=cv2.INTER_NEAREST)
        return np.array(Image.fromarray(layer).resize(dsize, resample=Image.NEAREST))

    binaries = [(_fit(read_mask_file(mask_path)) > 0).astype(np.uint8)
                for mask_path in mask_paths]

    merged = np.zeros_like(binaries[0], dtype=np.uint8)
    for layer in binaries:
        merged = np.maximum(merged, layer)
    return merged, len(mask_paths)

def connected_components_count(binary_mask: np.ndarray):
    """Count connected foreground regions in a mask.

    Uses scikit-image when available, then OpenCV, both with 8-connectivity
    on the binarized mask (``> 0``). With neither library, a non-empty mask
    is reported as a single component.

    Args:
        binary_mask: 2-D mask; may be ``None``.

    Returns:
        int: number of components; 0 for ``None``, empty, or all-zero input.
    """
    # .max() raises on a zero-size array, so test for emptiness first.
    if binary_mask is None or binary_mask.size == 0 or binary_mask.max() == 0:
        return 0
    foreground = binary_mask > 0
    if sk_label is not None:
        # Label the binarized mask (as the OpenCV branch does) so pixel values
        # other than 1 do not split a region; cast so the result is a plain int.
        return int(sk_label(foreground, connectivity=2).max())
    if cv2 is not None:
        num, _ = cv2.connectedComponents(foreground.astype(np.uint8), connectivity=8)
        return max(0, num - 1)  # label 0 is the background
    return int(foreground.sum() > 0)

def overlay_mask(img: np.ndarray, mask: np.ndarray, alpha=0.45, color=(255, 0, 0)):
    """Tint the masked pixels of ``img`` with ``color``.

    Args:
        img: image array; returned unchanged (same object) when there is
            nothing to draw.
        mask: mask whose non-zero pixels are tinted; may be ``None``.
        alpha: blend weight of ``color`` on masked pixels.
        color: three-component tint colour.

    Returns:
        np.ndarray: uint8 image with masked pixels blended toward ``color``.
    """
    nothing_to_draw = mask is None or mask.max() == 0
    if nothing_to_draw:
        return img

    base = img.astype(np.float32)
    tint = np.array(color, dtype=np.float32).reshape(1, 1, 3)
    blended = base*(1.0 - alpha) + tint*alpha
    # Add a trailing axis so the 2-D selection broadcasts across channels.
    selected = (mask > 0)[..., None]
    result = np.where(selected, blended, base)
    return np.clip(result, 0, 255).astype(np.uint8)

3) Build Metadata Table

def collect_images(root: Path, split_name: str):
    """Build one metadata record per image file directly inside ``root``.

    Args:
        root: directory to scan (not recursive); anything that is not an
            existing directory yields ``[]``.
        split_name: value stored under ``"split"`` in every record.

    Returns:
        list[dict]: records with keys ``case_id``, ``img_path`` and ``split``,
        ordered by stem length, then stem, then file name.
    """
    if not root.is_dir():
        return []
    # Compare suffixes case-insensitively (as the mask lookup does) so files
    # such as "1.PNG" are not skipped, and ignore directories.
    paths = [p for p in root.iterdir()
             if p.is_file() and p.suffix.lower() in IMG_EXTS]
    # Length-then-stem puts "2" before "10"; the file name breaks ties between
    # equal stems so the order is reproducible.
    paths.sort(key=lambda x: (len(x.stem), x.stem, x.name))
    return [{"case_id": get_case_id_from_path(p), "img_path": p, "split": split_name}
            for p in paths]

# Gather records for each split, in train / supplemental / test order.
records = [
    *collect_images(DIR_TRAIN_IMG, "train"),
    *collect_images(DIR_SUP_IMG, "supplemental"),
    *collect_images(DIR_TEST_IMG, "test"),
]

meta = pd.DataFrame.from_records(records)
print(f"[META] images found: {len(meta)}")
# Nothing downstream can run without images, so stop here.
if meta.empty:
    raise RuntimeError("No images are found. Check paths.")

def fast_probe(path: Path):
    """Read basic properties of one image file.

    Args:
        path: image file to inspect.

    Returns:
        pd.Series: ``width``, ``height``, ``mode`` and ``filesize_mb``. The
        image is converted to RGB first, so ``mode`` is always ``"RGB"`` on
        success. On any failure the numeric fields are NaN and ``mode`` is
        ``"ERR"``.
    """
    failed = {"width": np.nan, "height": np.nan, "mode": "ERR", "filesize_mb": np.nan}
    try:
        with Image.open(path) as handle:
            width, height = handle.convert("RGB").size
        size_mb = path.stat().st_size / (1024**2)  # bytes -> MiB
        probed = pd.Series({"width": width, "height": height,
                            "mode": "RGB", "filesize_mb": size_mb})
    except Exception:
        return pd.Series(failed)
    return probed

# Probe every image and append the resulting columns (width, height, mode,
# filesize_mb) to the metadata table, one row per image.
meta = pd.concat([meta, meta["img_path"].apply(fast_probe)], axis=1)

def compute_mask_stats(row):
    """Compute mask statistics for one metadata row.

    Args:
        row: mapping with ``split``, ``case_id``, ``height`` and ``width``.

    Returns:
        pd.Series: ``mask_count`` (number of mask files), ``coverage``
        (masked fraction of the ``height * width`` area) and ``n_comp``
        (connected components of the merged mask). All three are zero for
        test rows, rows without usable dimensions, cases without masks, and
        rows whose mask processing fails.
    """
    no_mask = {"mask_count": 0, "coverage": 0.0, "n_comp": 0}
    if row["split"] == "test":
        return pd.Series(no_mask)
    try:
        # NaN dimensions mean the image could not be probed; there is no
        # target size to align masks to, so skip the row.
        if pd.isna(row["height"]) or pd.isna(row["width"]):
            return pd.Series(no_mask)
        H, W = int(row["height"]), int(row["width"])
        union, mcount = read_mask_union_for_case(row["case_id"], target_hw=(H, W))
        if union is None:
            return pd.Series(no_mask)
        area = (union > 0).sum()
        cov = float(area) / float(max(1, H*W))  # max() guards a zero-area image
        ncomp = connected_components_count(union)
        return pd.Series({"mask_count": mcount, "coverage": cov, "n_comp": ncomp})
    except Exception as exc:
        # Stay best-effort, but report the failure: a silent zero row would be
        # indistinguishable from a case that has no mask.
        print(f"[WARN] mask stats failed for case {row.get('case_id')}: {exc!r}")
        return pd.Series(no_mask)

# Compute mask statistics row by row and append them to the metadata table.
mask_stats = meta.apply(compute_mask_stats, axis=1)
meta = pd.concat([meta, mask_stats], axis=1)
# Treat any missing statistic as "no mask".
for col in ["mask_count", "coverage", "n_comp"]:
    if col in meta.columns:
        meta[col] = meta[col].fillna(0)
# An image counts as forged when at least one mask file was found for it.
meta["is_forged"] = (meta["mask_count"] > 0).astype(int)

# "\n" must be a real newline escape: the doubled backslash printed the two
# characters backslash-n around each heading.
print("\n[HEAD]\n", meta.head())
print("\n[Split counts]\n", meta["split"].value_counts(dropna=False))
print("\n[Top resolutions]\n", meta[["width","height"]].value_counts().head())

Recod.ai/LUC - Scientific Image Forgery Detection EDA

0) Imports & Configuration

1) Locate Competition Paths

2) IO Utilities (Robust)

3) Build Metadata Table

4) Quick Distributions (English Labels)