import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from tqdm import tqdm
# Read the CSV file listing the images to download.
csv_file = r"C:\Users\shrey\Downloads\student_resource_3\student_resource 3\dataset\test.csv"  # Update with your actual file path
df = pd.read_csv(csv_file)

# Column in the CSV that holds the image URLs.
image_column = 'image_link'  # Update with your actual column name

# Directory to save the downloaded images.
save_dir = r"D:\hackathon_2\final_test_imgs"
# exist_ok=True is idempotent and avoids the check-then-create race
# of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(save_dir, exist_ok=True)
# Function to download a single image.
def download_image(url, index):
    """Download one image from *url* and save it as ``image_{index}.jpg``.

    Parameters
    ----------
    url : str
        Direct link to the image.
    index : int
        Row index from the DataFrame, used to build the file name.

    Returns
    -------
    str
        A status message: success, or the reason the download failed.
        (The original silently returned ``None`` on non-200 responses.)
    """
    image_name = f"image_{index}.jpg"  # You can modify this to keep the original file names if available
    save_path = os.path.join(save_dir, image_name)
    try:
        # timeout prevents a stalled server from hanging a worker thread
        # forever; the context manager guarantees the connection is
        # released even on the error path.
        with requests.get(url, stream=True, timeout=30) as response:
            # Turn non-2xx responses into an exception so they are
            # reported instead of silently dropped.
            response.raise_for_status()
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        return f"Downloaded {image_name}"
    except Exception as e:
        return f"Failed to download {url}: {e}"
# Using ThreadPoolExecutor to download images concurrently.
def download_all_images_concurrently(df, max_workers=10):
    """Download every non-null URL in ``df[image_column]`` with a thread pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the URL column named by the module-level
        ``image_column``.
    max_workers : int, optional
        Number of concurrent download threads (default 10).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_image, url, index)
            for index, url in df[image_column].items()
            if pd.notna(url)  # skip rows with missing URLs
        ]
        # as_completed yields each future as soon as its download
        # finishes, so the progress bar tracks real completion instead
        # of blocking on slow early submissions in list order.
        for future in tqdm(as_completed(futures), total=len(futures)):
            print(future.result())
# Start downloading images with 25 threads (you can adjust this number).
download_all_images_concurrently(df, max_workers=25)
# NOTE: len(df) counts CSV rows; rows with missing URLs are skipped and
# individual downloads may fail, so report rows processed, not successes.
print(f"Finished processing {len(df)} rows.")