Computer Vision Projects
Learn how computers "see" and understand images. From basic image processing to complex deep learning models for object detection and classification.
Projects in this guide
- • Image classification with CNNs
- • Object detection with YOLO
- • Face recognition system
- • Image enhancement with OpenCV
- • Style transfer with neural networks
- • Real-time video processing
Project 1: Image Classification with CNNs
Build a convolutional neural network that automatically classifies images into different categories.
🎯 Project Goal
Train a CNN model that distinguishes images of dogs and cats with >90% accuracy.
Setup and Data Preprocessing
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import numpy as np

# Check for GPU support
print("GPU available:", tf.config.list_physical_devices('GPU'))

# Data augmentation for better generalization
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    zoom_range=0.2,
    validation_split=0.2
)

test_datagen = ImageDataGenerator(rescale=1./255)

# Load the data (assumes the dogs-vs-cats dataset)
train_generator = train_datagen.flow_from_directory(
    'data/train',
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
    subset='training'
)

validation_generator = train_datagen.flow_from_directory(
    'data/train',
    target_size=(150, 150),
    batch_size=32,
    class_mode='binary',
    subset='validation'
)

print(f"Classes found: {train_generator.class_indices}")
print(f"Training images: {train_generator.samples}")
print(f"Validation images: {validation_generator.samples}")
Building the CNN Model
# Define the CNN architecture
model = models.Sequential([
    # First convolution block
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D(2, 2),

    # Second convolution block
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    # Third convolution block
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    # Fourth convolution block
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    # Flatten for the dense layers
    layers.Flatten(),

    # Dropout against overfitting
    layers.Dropout(0.5),

    # Dense layer
    layers.Dense(512, activation='relu'),

    # Output layer (binary classification)
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Show the model architecture
model.summary()
Training and Evaluation
# Callbacks for better training
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=3,
        min_lr=0.0001
    )
]

# Train the model
history = model.fit(
    train_generator,
    epochs=30,
    validation_data=validation_generator,
    callbacks=callbacks,
    verbose=1
)

# Visualize the training history
def plot_training_history(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Accuracy
    ax1.plot(history.history['accuracy'], label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax1.set_title('Model Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy')
    ax1.legend()

    # Loss
    ax2.plot(history.history['loss'], label='Training Loss')
    ax2.plot(history.history['val_loss'], label='Validation Loss')
    ax2.set_title('Model Loss')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Loss')
    ax2.legend()

    plt.tight_layout()
    plt.show()

plot_training_history(history)

# Save the model
model.save('dogs_vs_cats_model.h5')
print("Model saved!")
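After training, the saved model can be reloaded to classify a single image. The following is a minimal sketch: the model file name matches the code above, while 'test_image.jpg' and the class ordering are assumptions (check train_generator.class_indices for the actual mapping).

from tensorflow.keras.preprocessing import image

# Reload the model saved above
model = tf.keras.models.load_model('dogs_vs_cats_model.h5')

# Load and preprocess a single image the same way as during training
img = image.load_img('test_image.jpg', target_size=(150, 150))  # placeholder path
x = image.img_to_array(img) / 255.0  # same rescaling as the generators
x = np.expand_dims(x, axis=0)        # add a batch dimension: (1, 150, 150, 3)

# The sigmoid output is the probability of the class with index 1
prob = model.predict(x)[0][0]
label = 'dog' if prob > 0.5 else 'cat'  # assumes class_indices = {'cats': 0, 'dogs': 1}
print(f"Prediction: {label} (p={prob:.2f})")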
Project 2: Object Detection with YOLO
Implement a YOLO (You Only Look Once) system for real-time object detection in images and videos.
🚀 What You'll Learn
Understand the YOLO architecture, use pre-trained models, and build your own object detection systems.
Implementing YOLO with OpenCV
import cv2
import numpy as np

class YOLODetector:
    def __init__(self, weights_path, config_path, names_path):
        # Load the YOLO network
        self.net = cv2.dnn.readNet(weights_path, config_path)

        # Load the class names
        with open(names_path, 'r') as f:
            self.classes = [line.strip() for line in f.readlines()]

        # Determine the output layers
        self.output_layers = self.net.getUnconnectedOutLayersNames()

        # Colors for the bounding boxes
        self.colors = np.random.uniform(0, 255, size=(len(self.classes), 3))

    def detect_objects(self, image, confidence_threshold=0.5, nms_threshold=0.4):
        height, width, channels = image.shape

        # Prepare the image for YOLO
        blob = cv2.dnn.blobFromImage(
            image,
            scalefactor=1/255.0,
            size=(416, 416),
            mean=(0, 0, 0),
            swapRB=True,
            crop=False
        )

        # Forward pass
        self.net.setInput(blob)
        outputs = self.net.forward(self.output_layers)

        # Process the detections
        boxes, confidences, class_ids = [], [], []

        for output in outputs:
            for detection in output:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]

                if confidence > confidence_threshold:
                    # Bounding box center and size (scaled to pixel coordinates)
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)

                    # Compute the top-left corner of the box
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)

                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))
                    class_ids.append(class_id)

        # Non-maximum suppression
        indices = cv2.dnn.NMSBoxes(boxes, confidences, confidence_threshold, nms_threshold)

        return boxes, confidences, class_ids, indices

    def draw_detections(self, image, boxes, confidences, class_ids, indices):
        if len(indices) > 0:
            for i in indices.flatten():
                x, y, w, h = boxes[i]

                # Class and confidence
                label = f"{self.classes[class_ids[i]]}: {confidences[i]:.2f}"
                color = [int(c) for c in self.colors[class_ids[i]]]  # OpenCV expects int tuples

                # Draw the bounding box
                cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
                cv2.putText(
                    image, label, (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2
                )

        return image
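Before moving on to video, the detector can be tried on a single image. A minimal sketch, assuming the YOLOv3 files used in the video example below have already been downloaded; 'street.jpg' is a placeholder input path.

# Initialize the detector (same YOLOv3 files as in the video example below)
detector = YOLODetector(
    weights_path='yolov3.weights',
    config_path='yolov3.cfg',
    names_path='coco.names'
)

# Detect objects in a still image ('street.jpg' is a placeholder)
image = cv2.imread('street.jpg')
boxes, confidences, class_ids, indices = detector.detect_objects(image)
result = detector.draw_detections(image.copy(), boxes, confidences, class_ids, indices)

cv2.imshow('Detections', result)
cv2.waitKey(0)
cv2.destroyAllWindows()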
Using YOLO on a Video Stream
# Initialize the YOLO detector
detector = YOLODetector(
    weights_path='yolov3.weights',
    config_path='yolov3.cfg',
    names_path='coco.names'
)

def process_video(source=0):  # 0 for webcam, or a path to a video file
    cap = cv2.VideoCapture(source)

    # Video properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    if fps <= 0:
        fps = 30  # webcams often report 0; fall back to a sane default
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Video writer for the output (optional)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter('output.avi', fourcc, fps, (width, height))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Object detection
        boxes, confidences, class_ids, indices = detector.detect_objects(frame)

        # Draw the detections
        result_frame = detector.draw_detections(
            frame.copy(), boxes, confidences, class_ids, indices
        )

        # Show the FPS
        cv2.putText(
            result_frame, f'FPS: {fps}', (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2
        )

        # Show the frame
        cv2.imshow('YOLO Object Detection', result_frame)

        # Save the video (optional)
        out.write(result_frame)

        # Quit with 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    # Clean up
    cap.release()
    out.release()
    cv2.destroyAllWindows()

# Start processing the video
process_video(0)  # use the webcam
Project 3: Face Recognition System
Build a complete face recognition system covering face detection, encoding, and verification.
🔒 Applications
Security systems, attendance tracking, personalized user experiences.
Face Recognition with face_recognition
import face_recognition
import cv2
import numpy as np
import os
import pickle
from datetime import datetime

class FaceRecognitionSystem:
    def __init__(self):
        self.known_face_encodings = []
        self.known_face_names = []
        self.face_locations = []
        self.face_encodings = []
        self.face_names = []

    def load_known_faces(self, faces_dir):
        """Load known faces from a directory"""
        print("Loading known faces...")

        for filename in os.listdir(faces_dir):
            if filename.endswith(('.jpg', '.jpeg', '.png')):
                # Load the image
                image_path = os.path.join(faces_dir, filename)
                image = face_recognition.load_image_file(image_path)

                # Extract the face encoding
                face_encodings = face_recognition.face_encodings(image)

                if face_encodings:
                    # Derive the name from the file name
                    name = os.path.splitext(filename)[0]
                    self.known_face_encodings.append(face_encodings[0])
                    self.known_face_names.append(name)
                    print(f"Loaded face: {name}")
                else:
                    print(f"No face found in: {filename}")

    def save_encodings(self, filepath):
        """Save the encodings to disk"""
        data = {
            'encodings': self.known_face_encodings,
            'names': self.known_face_names
        }
        with open(filepath, 'wb') as f:
            pickle.dump(data, f)
        print(f"Encodings saved to: {filepath}")

    def load_encodings(self, filepath):
        """Load encodings from disk"""
        try:
            with open(filepath, 'rb') as f:
                data = pickle.load(f)
            self.known_face_encodings = data['encodings']
            self.known_face_names = data['names']
            print(f"Encodings loaded from: {filepath}")
            return True
        except FileNotFoundError:
            print(f"Encodings file not found: {filepath}")
            return False

    def recognize_faces_in_image(self, image):
        """Recognize faces in an image"""
        # Convert the image from BGR to RGB
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Find and encode the faces
        face_locations = face_recognition.face_locations(rgb_image)
        face_encodings = face_recognition.face_encodings(rgb_image, face_locations)

        face_names = []
        for face_encoding in face_encodings:
            # Compare the face against the known faces
            matches = face_recognition.compare_faces(
                self.known_face_encodings, face_encoding, tolerance=0.6
            )
            name = "Unknown"

            # Find the best match
            face_distances = face_recognition.face_distance(
                self.known_face_encodings, face_encoding
            )

            if matches and len(face_distances) > 0:
                best_match_index = np.argmin(face_distances)
                if matches[best_match_index]:
                    name = self.known_face_names[best_match_index]

            face_names.append(name)

        return face_locations, face_names
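A quick way to test the class is on a single photo before going real-time. A minimal sketch: 'known_faces/' matches the directory used in the real-time example below, and 'group_photo.jpg' is a placeholder.

# Enroll known faces, then recognize faces in a single photo
fr_system = FaceRecognitionSystem()
fr_system.load_known_faces('known_faces/')

image = cv2.imread('group_photo.jpg')  # placeholder path
locations, names = fr_system.recognize_faces_in_image(image)

for (top, right, bottom, left), name in zip(locations, names):
    print(f"{name} at (top={top}, left={left})")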
Real-time Face Recognition
def run_face_recognition():
    # Initialize the system
    fr_system = FaceRecognitionSystem()

    # Load cached encodings, or compute them from the known faces
    if not fr_system.load_encodings('face_encodings.pkl'):
        fr_system.load_known_faces('known_faces/')
        fr_system.save_encodings('face_encodings.pkl')

    # Initialize the webcam
    video_capture = cv2.VideoCapture(0)

    # Optimization: only process every other frame
    process_this_frame = True
    frame_count = 0

    # Attendance log
    attendance_log = set()

    face_locations, face_names = [], []

    while True:
        ret, frame = video_capture.read()
        if not ret:
            break

        # Downscale the frame for better performance
        small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)

        # Only process every other frame
        if process_this_frame:
            # Run the face recognition
            face_locations, face_names = fr_system.recognize_faces_in_image(small_frame)

            # Scale the coordinates back up
            face_locations = [(top*4, right*4, bottom*4, left*4)
                              for (top, right, bottom, left) in face_locations]

        process_this_frame = not process_this_frame

        # Draw the results
        for (top, right, bottom, left), name in zip(face_locations, face_names):
            # Bounding box
            color = (0, 255, 0) if name != "Unknown" else (0, 0, 255)
            cv2.rectangle(frame, (left, top), (right, bottom), color, 2)

            # Label
            cv2.rectangle(frame, (left, bottom - 35), (right, bottom), color, cv2.FILLED)
            cv2.putText(
                frame, name, (left + 6, bottom - 6),
                cv2.FONT_HERSHEY_DUPLEX, 0.6, (255, 255, 255), 1
            )

            # Log attendance
            if name != "Unknown" and name not in attendance_log:
                attendance_log.add(name)
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(f"Present: {name} at {timestamp}")

        # Show FPS and frame info
        frame_count += 1
        cv2.putText(
            frame, f'Frame: {frame_count}', (10, 30),
            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2
        )
        cv2.putText(
            frame, f'Present: {len(attendance_log)}', (10, 60),
            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2
        )

        # Show the frame
        cv2.imshow('Face Recognition', frame)

        # Quit with 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    # Clean up
    video_capture.release()
    cv2.destroyAllWindows()

    # Print the final attendance list
    print("\nFinal attendance list:")
    for person in attendance_log:
        print(f"- {person}")

# Start the system
if __name__ == "__main__":
    run_face_recognition()
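To make the attendance log survive restarts, it can be written to a CSV file on exit. A minimal sketch under the assumption that the log is changed from a set to a dict mapping names to timestamps; the file name 'attendance.csv' is a placeholder.

import csv
from datetime import datetime

def save_attendance(attendance, filepath='attendance.csv'):
    # 'attendance' is assumed to be a dict: name -> timestamp string
    with open(filepath, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'timestamp'])
        for name, timestamp in attendance.items():
            writer.writerow([name, timestamp])

# Example usage with a hypothetical single entry
attendance = {'alice': datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
save_attendance(attendance)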
Advanced Computer Vision Techniques
Style Transfer
Transfer the style of an artwork onto your own images using neural networks.
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

# Load the pre-trained style transfer model
model = hub.load('https://tfhub.dev/google/magenta/arbitrary-image-stylization-v1-256/2')

def load_and_preprocess_image(path, max_dim=512):
    image = tf.io.read_file(path)
    image = tf.image.decode_image(image, channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)

    # Resize so the longer side equals max_dim
    shape = tf.cast(tf.shape(image)[:-1], tf.float32)
    long_dim = tf.reduce_max(shape)
    scale = max_dim / long_dim

    new_shape = tf.cast(shape * scale, tf.int32)
    image = tf.image.resize(image, new_shape)
    image = image[tf.newaxis, :]

    return image

def apply_style_transfer(content_path, style_path):
    # Load the images
    content_image = load_and_preprocess_image(content_path)
    style_image = load_and_preprocess_image(style_path)

    # Apply the style transfer
    stylized_image = model(tf.constant(content_image), tf.constant(style_image))[0]

    return stylized_image

# Example usage
content_path = 'content_image.jpg'
style_path = 'style_image.jpg'

stylized = apply_style_transfer(content_path, style_path)

# Show the result
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.imshow(load_and_preprocess_image(content_path)[0])
plt.title('Content')
plt.axis('off')

plt.subplot(1, 3, 2)
plt.imshow(load_and_preprocess_image(style_path)[0])
plt.title('Style')
plt.axis('off')

plt.subplot(1, 3, 3)
plt.imshow(stylized[0])
plt.title('Stylized')
plt.axis('off')

plt.show()
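To keep the result, the output tensor can be converted back to an 8-bit image and written to disk. A minimal sketch; 'stylized_output.png' is a placeholder name.

from PIL import Image
import numpy as np

def tensor_to_image(tensor):
    # Convert a float tensor in [0, 1] back to an 8-bit PIL image
    array = np.array(tensor * 255, dtype=np.uint8)
    if array.ndim == 4:
        array = array[0]  # drop the batch dimension
    return Image.fromarray(array)

# Save the stylized result ('stylized_output.png' is a placeholder name)
tensor_to_image(stylized).save('stylized_output.png')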
Image Segmentation
Segment images at the pixel level for precise object boundaries.
import cv2
import numpy as np

def semantic_segmentation_kmeans(image, k=3):
    """Simple segmentation using k-means clustering"""
    # Reshape the image into a 2D array of pixels
    data = image.reshape((-1, 3))
    data = np.float32(data)

    # K-means clustering
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 20, 1.0)
    _, labels, centers = cv2.kmeans(data, k, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

    # Convert the cluster centers to uint8
    centers = np.uint8(centers)

    # Build the segmented image
    segmented_data = centers[labels.flatten()]
    segmented_image = segmented_data.reshape(image.shape)

    return segmented_image, labels.reshape(image.shape[:2])

def watershed_segmentation(image):
    """Watershed algorithm for object segmentation"""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Remove noise via Otsu thresholding
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Morphological operations
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Determine the sure background
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Determine the sure foreground
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, 0.7 * dist_transform.max(), 255, 0)

    # Unknown region
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)

    # Create the markers
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1
    markers[unknown == 255] = 0

    # Apply the watershed
    markers = cv2.watershed(image, markers)
    image[markers == -1] = [255, 0, 0]  # mark boundaries in blue (BGR)

    return image, markers

# Example usage
image = cv2.imread('example.jpg')

# K-means segmentation
segmented_kmeans, labels = semantic_segmentation_kmeans(image, k=4)

# Watershed segmentation
segmented_watershed, markers = watershed_segmentation(image.copy())

# Show the results
cv2.imshow('Original', image)
cv2.imshow('K-Means Segmentation', segmented_kmeans)
cv2.imshow('Watershed Segmentation', segmented_watershed)
cv2.waitKey(0)
cv2.destroyAllWindows()
Performance Optimization
- ⚡ GPU acceleration: use CUDA or OpenCL backends for compute-intensive operations
- ⚡ Reduce image size: lower resolutions make real-time processing feasible
- ⚡ Model optimization: convert models with TensorRT or ONNX for faster deployment
- ⚡ Batch processing: process multiple images at once (see the sketch after this list)
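The last two points combine naturally: downscale incoming frames and push them through the network as a single batch instead of one by one. A minimal sketch using the Keras classifier from Project 1; the model file and the 150x150 input size come from that code, while the frame file names are placeholders.

import cv2
import numpy as np
import tensorflow as tf

# Load the classifier trained in Project 1
model = tf.keras.models.load_model('dogs_vs_cats_model.h5')

def classify_batch(frames, target_size=(150, 150)):
    # Downscale BGR frames, convert to RGB, rescale as in training, and
    # classify them in one forward pass: a single model.predict() call on
    # a batch amortizes the per-call overhead
    batch = np.stack([
        cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), target_size) / 255.0
        for frame in frames
    ]).astype(np.float32)
    return model.predict(batch)  # shape: (len(frames), 1)

# Example: classify several frames at once (placeholder file names)
frames = [cv2.imread(p) for p in ['frame1.jpg', 'frame2.jpg', 'frame3.jpg']]
probabilities = classify_batch(frames)
print(probabilities.ravel())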
Further Projects
Advanced Projects
- → 3D object detection and tracking
- → Augmented reality applications
- → Medical image analysis
- → Autonomous vehicle vision
Recommended Tools
- • OpenCV for classical CV
- • TensorFlow/PyTorch for deep learning
- • Detectron2 for object detection
- • MediaPipe for real-time CV