This script demonstrates how to ingest large CSV files into a PostgreSQL database using pandas and SQLAlchemy, with chunking and timestamp parsing.
Data: NYC yellow taxi trip records
#!/usr/bin/env python
# coding: utf-8
from time import time
import pandas as pd
from sqlalchemy import create_engine
# Peek at the first 100 rows to inspect the columns before ingesting
df = pd.read_csv('download.csv', nrows=100)
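# A quick look at the inferred dtypes (a sanity-check sketch, not part of the
# original flow) shows the pickup/dropoff columns arrive as plain strings,
# which is why they are converted to datetimes below.
print(df.dtypes)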
# Connect to PostgreSQL
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')
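# The connection URL follows SQLAlchemy's driver://user:password@host:port/database
# format. As a sanity check (a sketch added here, assuming pandas' pd.io.sql.get_schema
# helper), preview the CREATE TABLE statement pandas would generate for this frame.
print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))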
# Stream CSV file in chunks of 100,000 rows
df_iter = pd.read_csv('download.csv', iterator=True, chunksize=100000)
# Read the first chunk
df = next(df_iter)
# Convert timestamp columns
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime, format='mixed')
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime, format='mixed')
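# Note: format='mixed' (per-element format inference) is available from pandas 2.0.
# Alternative sketch (an aside, not what this script does): read_csv can parse the
# timestamp columns itself via parse_dates, making the per-chunk to_datetime calls
# unnecessary:
# df_iter = pd.read_csv('download.csv', iterator=True, chunksize=100000,
#                       parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])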
# Create the table schema only (header, zero rows), replacing any existing table
df.head(0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')
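# To confirm the empty table was created (a verification sketch, not part of the
# original script), SQLAlchemy's inspector can list the tables in the database.
from sqlalchemy import inspect
print(inspect(engine).get_table_names())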
# Insert the first chunk
df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')
# Insert remaining chunks in a loop
while True:
    try:
        t_start = time()
        # Read the next chunk; stop cleanly once the iterator is exhausted
        df = next(df_iter)
    except StopIteration:
        print('Finished ingesting all chunks')
        break
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime, format='mixed')
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime, format='mixed')
    df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')
    t_end = time()
    print('Inserted another chunk, took %.3f seconds' % (t_end - t_start))
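# Once the loop finishes, a quick count query (a verification sketch added here)
# confirms the rows landed in PostgreSQL.
print(pd.read_sql('SELECT COUNT(1) AS row_count FROM yellow_taxi_data', con=engine))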
download.csv is assumed to be available in the working directory.