Intermediate Level

Computer Vision projects

Lernen Sie, wie Computer "sehen" und Bilder verstehen können. Von grundlegender Bildverarbeitung bis hin zu komplexen Deep Learning-Modellen für Objekterkennung und Klassifikation.

30 min Lesezeit

Hands-on projects

projects in diesem Guide

•Bildklassifikation mit CNNs
•Objekterkennung mit YOLO
•Gesichtserkennung System

•Bildverbesserung mit OpenCV
•Style Transfer mit Neural Networks
•Real-time Video Processing

Projekt 1: Bildklassifikation mit CNNs

Erstellen Sie ein Convolutional Neural Network, das Bilder automatisch in verschiedene Kategorien klassifiziert.

🎯 Projekt-Ziel

Ein CNN-Modell trainieren, das Bilder von Hunden und Katzen mit >90% Genauigkeit unterscheiden kann.

Setup und Datenvorverarbeitung

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import numpy as np

# GPU-Support prüfen
print("GPU available:", tf.config.list_physical_devices('GPU'))

# Datenaugmentation für bessere Generalisierung
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    zoom_range=0.2,
    validation_split=0.2
)

test_datagen = ImageDataGenerator(rescale=1./255)

# Daten laden (angenommen: dogs-vs-cats Dataset)
train_generator = train_datagen.flow_from_directory(
    'data/train',
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
    subset='training'
)

validation_generator = train_datagen.flow_from_directory(
    'data/train',
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
    subset='validation'
)

print(f"Gefundene Klassen: {train_generator.class_indices}")
print(f"Trainingsbilder: {train_generator.samples}")
print(f"Validierungsbilder: {validation_generator.samples}")

CNN-Modell erstellen

# CNN-Architektur definieren
model = models.Sequential([
    # Erste Convolution-Schicht
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D(2, 2),
    
    # Zweite Convolution-Schicht
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),
    
    # Dritte Convolution-Schicht
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),
    
    # Vierte Convolution-Schicht
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),
    
    # Flatten für Dense Layer
    layers.Flatten(),
    
    # Dropout gegen Overfitting
    layers.Dropout(0.5),
    
    # Dense Layer
    layers.Dense(512, activation='relu'),
    
    # Output Layer (binäre Klassifikation)
    layers.Dense(1, activation='sigmoid')
])

# Modell kompilieren
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Modell-Architektur anzeigen
model.summary()

Training und Evaluation

# Callbacks für besseres Training
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=3,
        min_lr=0.0001
    )
]

# Modell trainieren
history = model.fit(
    train_generator,
    epochs=30,
    validation_data=validation_generator,
    callbacks=callbacks,
    verbose=1
)

# Trainingsverlauf visualisieren
def plot_training_history(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    
    # Accuracy
    ax1.plot(history.history['accuracy'], label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax1.set_title('Model Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy')
    ax1.legend()
    
    # Loss
    ax2.plot(history.history['loss'], label='Training Loss')
    ax2.plot(history.history['val_loss'], label='Validation Loss')
    ax2.set_title('Model Loss')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Loss')
    ax2.legend()
    
    plt.tight_layout()
    plt.show()

plot_training_history(history)

# Modell speichern
model.save('dogs_vs_cats_model.h5')
print("Modell gespeichert!")

Projekt 2: Objekterkennung mit YOLO

Implementieren Sie ein YOLO (You Only Look Once) System für Real-time Objekterkennung in Bildern und Videos.

🚀 Was Sie lernen

YOLO-Architektur verstehen, vortrainierte Modelle nutzen und eigene Objekterkennungssysteme entwickeln.

YOLO mit OpenCV implementieren

import cv2
import numpy as np
import argparse

class YOLODetector:
    def __init__(self, weights_path, config_path, names_path):
        # YOLO-Netzwerk laden
        self.net = cv2.dnn.readNet(weights_path, config_path)
        
        # Klassennamen laden
        with open(names_path, 'r') as f:
            self.classes = [line.strip() for line in f.readlines()]
        
        # Output-Layer bestimmen
        layer_names = self.net.getLayerNames()
        self.output_layers = [layer_names[i[0] - 1] for i in self.net.getUnconnectedOutLayers()]
        
        # Farben für Bounding Boxes
        self.colors = np.random.uniform(0, 255, size=(len(self.classes), 3))
    
    def detect_objects(self, image, confidence_threshold=0.5, nms_threshold=0.4):
        height, width, channels = image.shape
        
        # Bild für YOLO vorbereiten
        blob = cv2.dnn.blobFromImage(
            image, scalefactor=1/255.0, size=(416, 416), 
            mean=(0, 0, 0), swapRB=True, crop=False
        )
        
        # Forward Pass
        self.net.setInput(blob)
        outputs = self.net.forward(self.output_layers)
        
        # Erkennungen verarbeiten
        boxes, confidences, class_ids = [], [], []
        
        for output in outputs:
            for detection in output:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                
                if confidence > confidence_threshold:
                    # Bounding Box Koordinaten
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    
                    # Box-Koordinaten berechnen
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    
                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))
                    class_ids.append(class_id)
        
        # Non-Maximum Suppression
        indices = cv2.dnn.NMSBoxes(boxes, confidences, confidence_threshold, nms_threshold)
        
        return boxes, confidences, class_ids, indices
    
    def draw_detections(self, image, boxes, confidences, class_ids, indices):
        if len(indices) > 0:
            for i in indices.flatten():
                x, y, w, h = boxes[i]
                
                # Klasse und Konfidenz
                label = f"{self.classes[class_ids[i]]}: {confidences[i]:.2f}"
                color = self.colors[class_ids[i]]
                
                # Bounding Box zeichnen
                cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
                cv2.putText(
                    image, label, (x, y - 10), 
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2
                )
        
        return image

use YOLO for video stream

# YOLO-Detektor initialisieren
detector = YOLODetector(
    weights_path='yolov3.weights',
    config_path='yolov3.cfg',
    names_path='coco.names'
)

def process_video(source=0):  # 0 für Webcam, oder Pfad zu Video
    cap = cv2.VideoCapture(source)
    
    # Video-Properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
    # Video-Writer für Output (optional)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter('output.avi', fourcc, fps, (width, height))
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Objekterkennung
        boxes, confidences, class_ids, indices = detector.detect_objects(frame)
        
        # Erkennungen zeichnen
        result_frame = detector.draw_detections(
            frame.copy(), boxes, confidences, class_ids, indices
        )
        
        # FPS anzeigen
        cv2.putText(
            result_frame, f'FPS: {fps}', (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2
        )
        
        # Frame anzeigen
        cv2.imshow('YOLO Object Detection', result_frame)
        
        # Video speichern (optional)
        out.write(result_frame)
        
        # Beenden mit 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    
    # Aufräumen
    cap.release()
    out.release()
    cv2.destroyAllWindows()

# Video-Processing starten
process_video(0)  # Webcam verwenden

Projekt 3: Gesichtserkennung System

Entwickeln Sie ein komplettes Gesichtserkennungssystem mit Gesichtserkennung, -encoding und -verifikation.

🔒 Anwendungen

Secureheitssysteme, Anwesenheitskontrolle, personalisierte Benutzererfahrungen.

Gesichtserkennung mit face_recognition

import face_recognition
import cv2
import numpy as np
import os
import pickle
from datetime import datetime

class FaceRecognitionSystem:
    def __init__(self):
        self.known_face_encodings = []
        self.known_face_names = []
        self.face_locations = []
        self.face_encodings = []
        self.face_names = []
        
    def load_known_faces(self, faces_dir):
        """Bekannte Gesichter aus Ordner laden"""
        print("Lade bekannte Gesichter...")
        
        for filename in os.listdir(faces_dir):
            if filename.endswith(('.jpg', '.jpeg', '.png')):
                # Bild laden
                image_path = os.path.join(faces_dir, filename)
                image = face_recognition.load_image_file(image_path)
                
                # Gesichts-Encoding extrahieren
                face_encodings = face_recognition.face_encodings(image)
                
                if face_encodings:
                    # Name aus Dateiname extrahieren
                    name = os.path.splitext(filename)[0]
                    
                    self.known_face_encodings.append(face_encodings[0])
                    self.known_face_names.append(name)
                    print(f"Gesicht geladen: {name}")
                else:
                    print(f"Kein Gesicht gefunden in: {filename}")
    
    def save_encodings(self, filepath):
        """Encodings speichern"""
        data = {
            'encodings': self.known_face_encodings,
            'names': self.known_face_names
        }
        with open(filepath, 'wb') as f:
            pickle.dump(data, f)
        print(f"Encodings gespeichert in: {filepath}")
    
    def load_encodings(self, filepath):
        """Encodings laden"""
        try:
            with open(filepath, 'rb') as f:
                data = pickle.load(f)
            self.known_face_encodings = data['encodings']
            self.known_face_names = data['names']
            print(f"Encodings geladen von: {filepath}")
            return True
        except FileNotFoundError:
            print(f"Encodings-Datei nicht gefunden: {filepath}")
            return False
    
    def recognize_faces_in_image(self, image):
        """Gesichter in einem Bild erkennen"""
        # Bild von BGR zu RGB konvertieren
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Gesichter finden und encodieren
        face_locations = face_recognition.face_locations(rgb_image)
        face_encodings = face_recognition.face_encodings(rgb_image, face_locations)
        
        face_names = []
        for face_encoding in face_encodings:
            # Gesicht mit bekannten Gesichtern vergleichen
            matches = face_recognition.compare_faces(
                self.known_face_encodings, face_encoding, tolerance=0.6
            )
            name = "Unbekannt"
            
            # Beste Übereinstimmung finden
            face_distances = face_recognition.face_distance(
                self.known_face_encodings, face_encoding
            )
            
            if matches and len(face_distances) > 0:
                best_match_index = np.argmin(face_distances)
                if matches[best_match_index]:
                    name = self.known_face_names[best_match_index]
            
            face_names.append(name)
        
        return face_locations, face_names

Real-time Gesichtserkennung

def run_face_recognition():
    # System initialisieren
    fr_system = FaceRecognitionSystem()
    
    # Bekannte Gesichter laden oder Encodings laden
    if not fr_system.load_encodings('face_encodings.pkl'):
        fr_system.load_known_faces('known_faces/')
        fr_system.save_encodings('face_encodings.pkl')
    
    # Webcam initialisieren
    video_capture = cv2.VideoCapture(0)
    
    # optimization: Nur jedes n-te Frame verarbeiten
    process_this_frame = True
    frame_count = 0
    
    # Anwesenheitsliste
    attendance_log = set()
    
    while True:
        ret, frame = video_capture.read()
        
        # Frame-Größe reduzieren für bessere Performance
        small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)
        
        # Nur jedes zweite Frame verarbeiten
        if process_this_frame:
            # Gesichtserkennung durchführen
            face_locations, face_names = fr_system.recognize_faces_in_image(small_frame)
            
            # Koordinaten zurückskalieren
            face_locations = [(top*4, right*4, bottom*4, left*4) 
                            for (top, right, bottom, left) in face_locations]
        
        process_this_frame = not process_this_frame
        
        # Ergebnisse zeichnen
        for (top, right, bottom, left), name in zip(face_locations, face_names):
            # Bounding Box
            color = (0, 255, 0) if name != "Unbekannt" else (0, 0, 255)
            cv2.rectangle(frame, (left, top), (right, bottom), color, 2)
            
            # Label
            cv2.rectangle(frame, (left, bottom - 35), (right, bottom), color, cv2.FILLED)
            cv2.putText(
                frame, name, (left + 6, bottom - 6),
                cv2.FONT_HERSHEY_DUPLEX, 0.6, (255, 255, 255), 1
            )
            
            # Anwesenheit loggen
            if name != "Unbekannt" and name not in attendance_log:
                attendance_log.add(name)
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(f"Anwesenheit: {name} um {timestamp}")
        
        # FPS und Frame-Info anzeigen
        frame_count += 1
        cv2.putText(
            frame, f'Frame: {frame_count}', (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2
        )
        
        cv2.putText(
            frame, f'Anwesend: {len(attendance_log)}', (10, 60),
            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2
        )
        
        # Frame anzeigen
        cv2.imshow('Gesichtserkennung', frame)
        
        # Beenden mit 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    
    # Aufräumen
    video_capture.release()
    cv2.destroyAllWindows()
    
    # Finale Anwesenheitsliste ausgeben
    print("\nFinale Anwesenheitsliste:")
    for person in attendance_log:
        print(f"- {person}")

# System starten
if __name__ == "__main__":
    run_face_recognition()

Erweiterte Computer Vision Techniken

Style Transfer

Übertragen Sie den Stil eines Kunstwerks auf Ihre eigenen Bilder mit neuronalen Netzwerken.

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

# Pre-trained Style Transfer Modell laden
model = hub.load('https://tfhub.dev/google/magenta/arbitrary-image-stylization-v1-256/2')

def load_and_preprocess_image(path, max_dim=512):
    image = tf.io.read_file(path)
    image = tf.image.decode_image(image, channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)
    
    # Größe anpassen
    shape = tf.cast(tf.shape(image)[:-1], tf.float32)
    long_dim = max(shape)
    scale = max_dim / long_dim
    new_shape = tf.cast(shape * scale, tf.int32)
    image = tf.image.resize(image, new_shape)
    image = image[tf.newaxis, :]
    
    return image

def apply_style_transfer(content_path, style_path):
    # Bilder laden
    content_image = load_and_preprocess_image(content_path)
    style_image = load_and_preprocess_image(style_path)
    
    # Style Transfer anwenden
    stylized_image = model(tf.constant(content_image), tf.constant(style_image))[0]
    
    return stylized_image

# Beispiel-Verwendung
content_path = 'content_image.jpg'
style_path = 'style_image.jpg'

stylized = apply_style_transfer(content_path, style_path)

# Ergebnis anzeigen
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.imshow(load_and_preprocess_image(content_path)[0])
plt.title('Content')
plt.axis('off')

plt.subplot(1, 3, 2)
plt.imshow(load_and_preprocess_image(style_path)[0])
plt.title('Style')
plt.axis('off')

plt.subplot(1, 3, 3)
plt.imshow(stylized[0])
plt.title('Stylized')
plt.axis('off')

plt.show()

Image Segmentation

Segmentieren Sie Bilder auf Pixel-Ebene für präzise Objekterkennung.

import cv2
import numpy as np
from sklearn.cluster import KMeans

def semantic_segmentation_kmeans(image, k=3):
    """Simplee Segmentierung mit K-Means Clustering"""
    # Bild in 2D Array umwandeln
    data = image.reshape((-1, 3))
    data = np.float32(data)
    
    # K-Means Clustering
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 20, 1.0)
    _, labels, centers = cv2.kmeans(data, k, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
    
    # Cluster-Zentren zu uint8 konvertieren
    centers = np.uint8(centers)
    
    # Segmentiertes Bild erstellen
    segmented_data = centers[labels.flatten()]
    segmented_image = segmented_data.reshape(image.shape)
    
    return segmented_image, labels.reshape(image.shape[:2])

def watershed_segmentation(image):
    """Watershed algorithm for object segmentation"""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # Noise entfernen
    ret, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    
    # Morphologische Operationen
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)
    
    # Hintergrund bestimmen
    sure_bg = cv2.dilate(opening, kernel, iterations=3)
    
    # Vordergrund bestimmen
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    ret, sure_fg = cv2.threshold(dist_transform, 0.7 * dist_transform.max(), 255, 0)
    
    # Unbekannte Region
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)
    
    # Marker erstellen
    ret, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1
    markers[unknown == 255] = 0
    
    # Watershed anwenden
    markers = cv2.watershed(image, markers)
    image[markers == -1] = [255, 0, 0]  # Grenzen rot markieren
    
    return image, markers

# Beispiel-Usage
image = cv2.imread('example.jpg')

# K-Means Segmentierung
segmented_kmeans, labels = semantic_segmentation_kmeans(image, k=4)

# Watershed Segmentierung  
segmented_watershed, markers = watershed_segmentation(image.copy())

# Ergebnisse anzeigen
cv2.imshow('Original', image)
cv2.imshow('K-Means Segmentation', segmented_kmeans)
cv2.imshow('Watershed Segmentation', segmented_watershed)
cv2.waitKey(0)
cv2.destroyAllWindows()

Performance-optimization

⚡GPU acceleration: Use CUDA and OpenCL for intensive calculations
⚡Reduce image size: Smaller resolutions for real-time processing
⚡Model Optimization: TensorRT, ONNX for deployment optimization
⚡Batch Processing: Mehrere Bilder gleichzeitig verarbeiten

Further projects

Fortgeschrittene projects

→3D Objekterkennung und Tracking
→Augmented Reality Anwendungen
→Medizinische Bildanalyse
→Autonomous Vehicle Vision

Empfohlene Tools

•OpenCV für klassische CV
•TensorFlow/PyTorch für Deep Learning
•Detectron2 für Objekterkennung
•MediaPipe für Real-time CV

Inhaltsverzeichnis

Projekt-Status

BildklassifikationKomplett

YOLO DetectionKomplett

GesichtserkennungKomplett

Style TransferBonus

Voraussetzungen

✓Python 3.7+

✓OpenCV, TensorFlow

✓GPU (empfohlen)

○ML-Grundkenntnisse