Skip to content

Intel ARC 770 Segmentation fault #471

Open
@JannisFengler

Description

@JannisFengler

Describe the bug

I am encountering a consistent segmentation fault during the training of machine learning models using the Intel ARC 770 GPU. The fault appears after a certain number of training steps and is noticeably more frequent when working with larger models or datasets.

import torch
import torch.nn as nn
import torch.optim as optim
import intel_extension_for_pytorch as ipex
import time
import csv

# Create new floating-point tensors as float32 unless a dtype is given explicitly.
torch.set_default_dtype(torch.float32)
# Small fully connected network used for the reproduction
class SimpleModel(nn.Module):
    """Feed-forward network with three ReLU hidden layers and a linear output.

    Layer widths are
    ``input_size -> hidden_size -> 2 * hidden_size -> hidden_size -> output_size``.

    Args:
        input_size (int): Number of features per input sample.
        hidden_size (int): Width of the first and third hidden layers; the
            second hidden layer is twice as wide.
        output_size (int): Number of values produced per sample.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        super().__init__()
        doubled = hidden_size * 2
        # Layers are created in fc1..fc4 order so that parameter names and
        # seeded weight initialisation stay the same.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, doubled)
        self.fc3 = nn.Linear(doubled, hidden_size)
        self.fc4 = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the three hidden layers with ReLU, then the linear output layer."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            activations = torch.relu(hidden_layer(activations))
        # The output layer has no activation.
        return self.fc4(activations)

# Problem dimensions for the synthetic regression task
input_size, hidden_size, output_size = 10, 500, 100
num_samples = 1000

# Random dummy data: inputs are drawn first, then targets
data = torch.randn((num_samples, input_size), dtype=torch.float32)
target = torch.randn((num_samples, output_size), dtype=torch.float32)

# Wrap the tensors so they can be served in shuffled mini-batches of 32
train_dataset = torch.utils.data.TensorDataset(data, target)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

# Model, loss function and optimizer
model = SimpleModel(input_size, hidden_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

# Training function
def train_one_epoch(
    train_loader: torch.utils.data.DataLoader,
    model: nn.Module,
    criterion: nn.Module,
    optimizer: optim.Optimizer,
    device: torch.device
) -> float:
    """
    Train the model for one epoch and return the average batch loss.

    The average is taken over the batches actually processed, so any
    iterable of ``(data, target)`` batches works, including loaders that do
    not implement ``len()``.

    Args:
        train_loader (torch.utils.data.DataLoader): Iterable yielding
            ``(data, target)`` batches.
        model (nn.Module): Neural network model; switched to training mode.
        criterion (nn.Module): Loss function called as
            ``criterion(outputs, target)``.
        optimizer (optim.Optimizer): Optimizer updating the model parameters.
        device (torch.device): Device each batch is moved to. The value is
            passed straight to ``Tensor.to``, so a device string such as
            "cpu", "cuda" or "xpu" works as well.

    Returns:
        float: Mean of the per-batch losses, or NaN if the loader yielded no
        batches (the mean of zero losses is undefined).
    """

    model.train()  # Set model to training mode
    total_loss = 0.0
    num_batches = 0

    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()  # Zero out gradients
        outputs = model(data)  # Forward pass
        loss = criterion(outputs, target)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        # .item() converts the scalar loss tensor to a Python float.
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained; avoid dividing by zero.
        return float("nan")

    return total_loss / num_batches


# Example usage: move everything to the Intel GPU ("xpu") and let IPEX
# return its optimized model/optimizer pair.
device = 'xpu'
model = model.to(device)
model, optimizer = ipex.optimize(model, optimizer=optimizer)
criterion.to(device)

# Number of passes over the training data
num_epochs = 100

# Initialize CSV logging
csv_file = "training_log.csv"
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Epoch", "Loss", "Time (seconds)"])  # Write headers

    elapsed_times = []

    for epoch in range(num_epochs):
        start_time = time.time()
        avg_loss = train_one_epoch(train_loader, model, criterion, optimizer, device)
        elapsed_time = time.time() - start_time
        elapsed_times.append(elapsed_time)

        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, Time: {elapsed_time:.2f} seconds")

        # Log to CSV
        writer.writerow([epoch+1, f"{avg_loss:.4f}", f"{elapsed_time:.2f}"])
        # Flush after every epoch: if the process is killed (e.g. by a
        # segmentation fault) buffered rows would otherwise be lost.
        file.flush()

    # Calculate average time and log to CSV
    average_time = sum(elapsed_times) / len(elapsed_times)
    writer.writerow(["Average", "-", f"{average_time:.2f}"])

Error
Epoch 62, Loss: 0.9934, Time: 0.08 seconds
Epoch 63, Loss: 0.9946, Time: 0.08 seconds
Traceback (most recent call last):
  File "data/dummy_intel.py", line 97, in <module>
    avg_loss = train_one_epoch(train_loader, model, criterion, optimizer, device)
  File "data/dummy_intel.py", line 75, in train_one_epoch
    avg_loss += loss.item()
RuntimeError: Native API failed. Native API returns: -1 (CL_DEVICE_NOT_FOUND) -1 (CL_DEVICE_NOT_FOUND)
Segmentation fault

Versions

Intel XPU Docker image 2d2a3356c190, running in WSL on Windows 11

(But I got the same error in native Ubuntu)

Metadata

Metadata

Assignees

No one assigned

    Labels

    ARC (ARC GPU), Crash (Execution crashes)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions