import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from tqdm import tqdm
# Read the CSV file listing the images to download.
csv_file = r"C:\Users\shrey\Downloads\student_resource_3\student_resource 3\dataset\test.csv"  # Update with your actual file path
df = pd.read_csv(csv_file)

# Column in the CSV that holds the image URLs.
image_column = 'image_link'  # Update with your actual column name

# Directory to save the downloaded images.
save_dir = r"D:\hackathon_2\final_test_imgs"
# exist_ok=True is idempotent and avoids the check-then-create race
# of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(save_dir, exist_ok=True)
# Function to download a single image.
def download_image(url, index):
    """Download one image from *url* and save it as ``image_{index}.jpg``.

    Parameters
    ----------
    url : str
        Direct link to the image.
    index : int
        Row index from the DataFrame, used to build the file name.

    Returns
    -------
    str
        A status message: success, or the reason the download failed.
        (The original silently returned ``None`` on non-200 responses.)
    """
    image_name = f"image_{index}.jpg"  # You can modify this to keep the original file names if available
    save_path = os.path.join(save_dir, image_name)
    try:
        # timeout prevents a stalled server from hanging a worker thread
        # forever; the context manager guarantees the connection is
        # released even on the error path.
        with requests.get(url, stream=True, timeout=30) as response:
            # Turn non-2xx responses into an exception so they are
            # reported instead of silently dropped.
            response.raise_for_status()
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        return f"Downloaded {image_name}"
    except Exception as e:
        return f"Failed to download {url}: {e}"
# Using ThreadPoolExecutor to download images concurrently.
def download_all_images_concurrently(df, max_workers=10):
    """Download every non-null URL in ``df[image_column]`` with a thread pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the URL column named by the module-level
        ``image_column``.
    max_workers : int, optional
        Number of concurrent download threads (default 10).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_image, url, index)
            for index, url in df[image_column].items()
            if pd.notna(url)  # skip rows with missing URLs
        ]
        # as_completed yields each future as soon as its download
        # finishes, so the progress bar tracks real completion instead
        # of blocking on slow early submissions in list order.
        for future in tqdm(as_completed(futures), total=len(futures)):
            print(future.result())
# Start downloading images with 25 threads (you can adjust this number).
download_all_images_concurrently(df, max_workers=25)
# NOTE: len(df) counts CSV rows; rows with missing URLs are skipped and
# individual downloads may fail, so report rows processed, not successes.
print(f"Finished processing {len(df)} rows.")