"""Download every image referenced in a CSV column to a local directory.

Reads image URLs from the ``image_link`` column of the input CSV and saves
each one as ``image_<row-index>.jpg``. Downloads run on a thread pool:
the work is I/O-bound, so threads overlap well while waiting on the network.
"""
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from tqdm import tqdm

# Input CSV with one image URL per row.  # NOTE(review): update for your environment.
CSV_FILE = r"C:\Users\shrey\Downloads\student_resource_3\student_resource 3\dataset\test.csv"
# Name of the column holding the image URLs.
IMAGE_COLUMN = 'image_link'
# Destination directory for the downloaded files.
SAVE_DIR = r"D:\hackathon_2\final_test_imgs"


def download_image(url, index):
    """Download one image and save it as image_<index>.jpg under SAVE_DIR.

    Returns a human-readable status string and never raises: network and
    filesystem errors are caught and reported so one bad URL cannot kill
    the whole run.
    """
    image_name = f"image_{index}.jpg"
    save_path = os.path.join(SAVE_DIR, image_name)
    try:
        # timeout prevents a single stalled server from hanging a worker
        # forever; the context manager releases the streamed connection
        # even on the non-200 path.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                # BUG FIX: the original fell through here and returned None.
                return f"Failed to download {url}: HTTP {response.status_code}"
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        return f"Downloaded {image_name}"
    except Exception as e:
        return f"Failed to download {url}: {e}"


def download_all_images_concurrently(df, max_workers=10):
    """Download all non-null URLs in ``df[IMAGE_COLUMN]`` with a thread pool.

    Prints each worker's status line as it completes and returns the number
    of URLs actually submitted, so the caller can report an accurate count.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_image, url, index)
            for index, url in df[IMAGE_COLUMN].items()
            if pd.notna(url)  # skip rows with a missing URL
        ]
        # as_completed lets the progress bar advance as soon as *any*
        # download finishes, instead of stalling in submission order.
        for future in tqdm(as_completed(futures), total=len(futures)):
            print(future.result())
    return len(futures)


def main():
    """Script entry point: read the CSV and fetch every referenced image."""
    df = pd.read_csv(CSV_FILE)
    os.makedirs(SAVE_DIR, exist_ok=True)  # idempotent; no exists() race
    # 25 workers: purely I/O-bound, so oversubscribing cores is fine.
    submitted = download_all_images_concurrently(df, max_workers=25)
    # Report the number of URLs submitted, not len(df): NaN rows are skipped.
    print(f"Downloaded {submitted} images.")


if __name__ == "__main__":
    main()