This notebook performs exploratory data analysis with the following setup, helper functions, and per-image metadata table:
import os, re, math, random
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Notebook-wide plot defaults: a 10x6 figure size and grid lines enabled.
plt.rcParams.update({
    "figure.figsize": (10, 6),
    "axes.grid": True,
})
try:
import seaborn as sns
sns.set_context("talk")
except Exception:
sns = None
from PIL import Image
try:
from skimage.measure import label as sk_label
except Exception:
sk_label = None
try:
import cv2
except Exception:
cv2 = None
# One fixed seed for both the stdlib and NumPy generators, so that any
# random sampling done later is repeatable between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Upper bound on the sample size used for heavy operations.
SLOW_FEATURE_SAMPLE_MAX = 800
def find_competition_root():
    """Locate the competition dataset directory.

    Scans ``/kaggle/input`` (when it exists) and returns the first
    sub-directory, in the order ``iterdir`` yields entries, whose name
    contains the competition slug. When nothing matches, the relative
    path ``./<slug>`` is returned instead.
    """
    slug = "recodai-luc-scientific-image-forgery-detection"
    kaggle_input = Path("/kaggle/input")
    if kaggle_input.exists():
        for entry in kaggle_input.iterdir():
            if entry.is_dir() and slug in entry.name:
                return entry
    return Path(f"./{slug}")
# Resolve the dataset root once; every data directory hangs off it.
COMP_ROOT = find_competition_root()
print(f"[PATH] COMP_ROOT = {COMP_ROOT}")

DIR_TRAIN_IMG = COMP_ROOT / "train_images"
DIR_TRAIN_MASK = COMP_ROOT / "train_masks"
DIR_SUP_IMG = COMP_ROOT / "supplemental_images"
DIR_SUP_MASK = COMP_ROOT / "supplemental_masks"
DIR_TEST_IMG = COMP_ROOT / "test_images"

# Report which of the expected directories are present on disk.
for p in (
    DIR_TRAIN_IMG,
    DIR_TRAIN_MASK,
    DIR_SUP_IMG,
    DIR_SUP_MASK,
    DIR_TEST_IMG,
):
    print(f" - {p}: {'OK' if p.exists() else 'Missing'}")
# Lower-case file suffixes (with the leading dot) accepted as images.
IMG_EXTS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".tif",
    ".tiff",
    ".bmp",
}

# Lower-case file suffixes accepted as mask files.
MASK_EXTS = {
    ".png",
    ".npy",
}
def get_case_id_from_path(path: Path):
    """Extract the case identifier from a file path.

    Returns the first run of decimal digits in the file stem as an ``int``.
    When the stem contains no digits, the stem itself is returned (``str``).
    """
    # A single backslash is required inside the raw string: r"\\d+" would
    # look for a literal backslash followed by the letter "d" and therefore
    # never find a numeric id.
    match = re.search(r"\d+", path.stem)
    return int(match.group()) if match else path.stem
def read_image(path: Path):
    """Load an image file as an RGB array.

    The image is always converted to RGB (3 channels) so that overlays are
    consistent regardless of the file's native mode.

    Returns:
        The converted image as a NumPy array (HWC, uint8).
    """
    # The context manager releases the underlying file handle as soon as the
    # pixel data has been copied into the array.
    with Image.open(path) as im:
        return np.array(im.convert("RGB"))
def read_mask_file(path: Path):
    """Load a single mask file (.npy or raster image) as a binary mask.

    ``.npy`` files are loaded with NumPy. A 3-D array is treated as a stack
    of masks and collapsed into the union of its layers, so the result is
    2-D. Any other file is read with OpenCV when it is available (falling
    back to PIL when OpenCV is missing or cannot decode the file) and
    reduced to a single grey channel.

    Returns:
        uint8 array with values in {0, 1}; ``(H, W)`` for 2-D and 3-D inputs.
    """
    if path.suffix.lower() == ".npy":
        m = np.load(path)
        if m.ndim == 3:
            # Heuristic: the shorter of the first/last axes is taken as the
            # stack axis (leading axis on ties). This reproduces the plain
            # squeeze for singleton axes and additionally merges multi-layer
            # stacks instead of returning a 3-D array.
            # NOTE(review): confirm the layout used by the dataset's masks.
            stack_axis = -1 if m.shape[-1] < m.shape[0] else 0
            return (m > 0).any(axis=stack_axis).astype(np.uint8)
        return (m > 0).astype(np.uint8)

    m = None
    if cv2 is not None:
        m = cv2.imread(str(path), cv2.IMREAD_UNCHANGED)
        if m is not None and m.ndim == 3:
            m = cv2.cvtColor(m, cv2.COLOR_BGR2GRAY)
    if m is None:
        # OpenCV is unavailable or could not decode the file: use PIL, and
        # close the file handle once the pixels are copied.
        with Image.open(path) as im:
            m = np.array(im.convert("L"))
    return (m > 0).astype(np.uint8)
def list_mask_files_for_case(case_id, mask_dir: Path):
    """Return the mask files in *mask_dir* that belong to *case_id*.

    A file qualifies when its suffix (case-insensitive) is in ``MASK_EXTS``
    and at least one of the following holds:

    * its name is ``<case_id>.<anything>``;
    * its name starts with ``<case_id>_``;
    * its stem contains the id delimited by regex word boundaries.

    Returns:
        A sorted list of unique paths; empty when *mask_dir* does not exist.
    """
    if not mask_dir.exists():
        return []

    # Single backslashes inside the raw string give real word-boundary
    # anchors; the id is escaped so that a non-numeric id (a file stem)
    # containing regex metacharacters is matched literally.
    whole_id = re.compile(rf"\b{re.escape(str(case_id))}\b")

    candidates = set(mask_dir.glob(f"{case_id}.*"))
    candidates.update(mask_dir.glob(f"{case_id}_*.*"))
    candidates.update(p for p in mask_dir.glob("*.*") if whole_id.search(p.stem))

    return sorted(p for p in candidates if p.suffix.lower() in MASK_EXTS)
def read_mask_union_for_case(case_id, target_hw=None):
    """Merge every mask file of a case into one binary mask.

    Mask files are looked up in ``DIR_TRAIN_MASK`` and then ``DIR_SUP_MASK``.
    When *target_hw* ``(height, width)`` is given, any mask whose first two
    dimensions differ from it is resized with nearest-neighbour
    interpolation before merging.

    Returns:
        ``(union, n_files)`` where *union* is a uint8 array that is 1
        wherever any mask is non-zero, or ``(None, 0)`` when the case has
        no mask files.
    """
    mask_paths = (list_mask_files_for_case(case_id, DIR_TRAIN_MASK)
                  + list_mask_files_for_case(case_id, DIR_SUP_MASK))
    if len(mask_paths) == 0:
        return None, 0

    layers = []
    for mask_path in mask_paths:
        layer = read_mask_file(mask_path)
        if target_hw is not None:
            current_hw = (layer.shape[0], layer.shape[1])
            if current_hw != tuple(target_hw):
                # Both resize APIs take the size as (width, height).
                new_size = (target_hw[1], target_hw[0])
                if cv2 is not None:
                    layer = cv2.resize(layer, new_size, interpolation=cv2.INTER_NEAREST)
                else:
                    resized = Image.fromarray(layer).resize(new_size, resample=Image.NEAREST)
                    layer = np.array(resized)
        # Re-binarise after the optional resize.
        layers.append((layer > 0).astype(np.uint8))

    union = np.zeros_like(layers[0], dtype=np.uint8)
    for layer in layers:
        union = np.maximum(union, layer)
    return union, len(mask_paths)
def connected_components_count(binary_mask: np.ndarray):
    """Count the 8-connected foreground regions of a mask.

    Pixels greater than zero are foreground. scikit-image is used when it is
    available, otherwise OpenCV. When neither library is installed the
    regions cannot be counted and 1 is returned for any mask that has
    foreground.

    Returns:
        A Python ``int``; 0 for ``None``, a zero-size array, or a mask
        without foreground.
    """
    # Check the size first: calling .max() on a zero-size array raises.
    if binary_mask is None or binary_mask.size == 0:
        return 0

    # Binarise once so both backends label exactly the same foreground.
    foreground = binary_mask > 0
    if not foreground.any():
        return 0

    if sk_label is not None:
        # connectivity=2 on a 2-D array means 8-connectivity; int() keeps the
        # return type identical across backends.
        return int(sk_label(foreground, connectivity=2).max())
    if cv2 is not None:
        num, _ = cv2.connectedComponents(foreground.astype(np.uint8), connectivity=8)
        # OpenCV counts the background as one label.
        return max(0, int(num) - 1)
    return 1
def overlay_mask(img: np.ndarray, mask: np.ndarray, alpha=0.45, color=(255, 0, 0)):
    """Tint the masked pixels of an image with a colour.

    Pixels where *mask* is greater than zero become
    ``img * (1 - alpha) + color * alpha``; all other pixels are unchanged.
    When *mask* is ``None`` or has no positive pixel, *img* itself is
    returned (not a copy).

    Returns:
        A uint8 array, clipped to [0, 255].
    """
    if mask is None or mask.max() == 0:
        return img

    base = img.astype(np.float32)
    tint = np.array(color, dtype=np.float32).reshape(1, 1, 3)
    blended = base * (1.0 - alpha) + tint * alpha

    # Add a trailing axis so the 2-D selection broadcasts over the channels.
    selected = (mask > 0)[..., None]
    result = np.where(selected, blended, base)
    return np.clip(result, 0, 255).astype(np.uint8)
def collect_images(root: Path, split_name: str):
    """List the image files directly inside *root* as metadata records.

    A file is an image when its suffix, compared case-insensitively, is in
    ``IMG_EXTS`` (so ``.PNG`` is found as well as ``.png``). Sub-directories
    are not searched and are never returned themselves.

    Args:
        root: directory to scan.
        split_name: value stored under ``"split"`` in every record.

    Returns:
        A list of dicts with keys ``"case_id"``, ``"img_path"`` and
        ``"split"``, ordered by stem length and then stem, so that purely
        numeric stems come out in numeric order. Empty when *root* is not an
        existing directory.
    """
    if not root.is_dir():
        return []

    paths = [
        p for p in root.iterdir()
        if p.is_file() and p.suffix.lower() in IMG_EXTS
    ]
    # The full name is the last tie-breaker, so files sharing a stem (for
    # example 1.png and 1.jpg) come out in a reproducible order.
    paths.sort(key=lambda p: (len(p.stem), p.stem, p.name))

    return [
        {"case_id": get_case_id_from_path(p), "img_path": p, "split": split_name}
        for p in paths
    ]
# Gather one record per image, in the order train -> supplemental -> test,
# and turn the records into the metadata table used by the rest of the notebook.
records = (
    collect_images(DIR_TRAIN_IMG, "train")
    + collect_images(DIR_SUP_IMG, "supplemental")
    + collect_images(DIR_TEST_IMG, "test")
)
meta = pd.DataFrame.from_records(records)
print(f"[META] images found: {len(meta)}")

# Nothing below can work without images, so stop right away.
if meta.shape[0] == 0:
    raise RuntimeError("No images are found. Check paths.")
def fast_probe(path: Path):
    """Read the basic properties of one image file.

    The image is opened and converted to RGB, then its size is read; the
    reported mode is therefore always ``"RGB"`` for a readable file. The
    file size is reported in MiB.

    Returns:
        A ``pd.Series`` with ``width``, ``height``, ``mode`` and
        ``filesize_mb``. On any error the numeric fields are NaN and
        ``mode`` is ``"ERR"``.
    """
    unreadable = {"width": np.nan, "height": np.nan, "mode": "ERR", "filesize_mb": np.nan}
    try:
        with Image.open(path) as handle:
            rgb = handle.convert("RGB")
            width, height = rgb.size
        size_mb = path.stat().st_size / (1024**2)
        probe = {"width": width, "height": height, "mode": "RGB", "filesize_mb": size_mb}
    except Exception:
        # Best effort: an unreadable file yields a placeholder row.
        return pd.Series(unreadable)
    return pd.Series(probe)
# Probe every image and append the resulting columns
# (width, height, mode, filesize_mb) to the metadata table.
meta = pd.concat(
    objs=[meta, meta["img_path"].apply(fast_probe)],
    axis="columns",
)
def compute_mask_stats(row):
    """Summarise the masks that belong to one metadata row.

    Args:
        row: mapping (e.g. a DataFrame row) with the keys ``"split"``,
            ``"case_id"``, ``"height"`` and ``"width"``.

    Returns:
        A ``pd.Series`` with

        * ``mask_count`` - number of mask files found for the case,
        * ``coverage``   - fraction of the image area covered by the mask union,
        * ``n_comp``     - number of connected components in the union.

        All three are zero for rows of the ``"test"`` split, for cases
        without mask files, and when the computation fails. A failure
        additionally emits a ``RuntimeWarning``, so a broken mask is not
        silently counted as "no mask".
    """
    import warnings  # function-scope import; used only on the failure path

    def _stats(mask_count=0, coverage=0.0, n_comp=0):
        # Single place that defines the layout of the result.
        return pd.Series({"mask_count": mask_count, "coverage": coverage, "n_comp": n_comp})

    if row["split"] == "test":
        return _stats()

    try:
        height, width = int(row["height"]), int(row["width"])
        union, mask_count = read_mask_union_for_case(row["case_id"], target_hw=(height, width))
        if union is None:
            return _stats()
        area = (union > 0).sum()
        # max(1, ...) guards against division by zero for degenerate sizes.
        coverage = float(area) / float(max(1, height * width))
        return _stats(mask_count, coverage, connected_components_count(union))
    except Exception as exc:
        # Stay best-effort (one bad case must not abort the whole table),
        # but make the failure visible.
        warnings.warn(
            f"mask statistics failed for case {row.get('case_id')!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return _stats()
# Compute the mask statistics row by row and append them to the metadata table.
mask_stats = meta.apply(compute_mask_stats, axis=1)
meta = pd.concat([meta, mask_stats], axis=1)

# A missing statistic is treated as "no mask".
for col in ["mask_count", "coverage", "n_comp"]:
    if col in meta.columns:
        meta[col] = meta[col].fillna(0)

# An image counts as forged when at least one mask file exists for it.
meta["is_forged"] = (meta["mask_count"] > 0).astype(int)

# "\n" with a single backslash, so each header is surrounded by real line
# breaks instead of printing the two characters backslash + n.
print("\n[HEAD]\n", meta.head())
print("\n[Split counts]\n", meta["split"].value_counts(dropna=False))
print("\n[Top resolutions]\n", meta[["width", "height"]].value_counts().head())