(my apologies this doesn't seem to work well in Firefox!)
>>> import sqlalchemy as sa
>>> import pandas as pd
>>> engine = sa.create_engine('sqlite:///flight.db')
>>> c = engine.connect()
flight | timestamp | temp | press | hum | … |
---|---|---|---|---|---|
hab1 | 09:00:00 | 25.5 | 1020 | 40 | … |
hab1 | 09:01:00 | 25.6 | 1019 | 41 | … |
hab1 | 09:02:00 | 25.4 | 1020 | 42 | … |
SQL “tables” have:
They are sets of tuples (rows)
column-def::
constraint-def::
Probably the most important bit of a relational database, and what really sets it apart from your spreadsheet!
columns::
expressions::
predicate::
test::
unique-key::
check::
foreign-key::
rule::
CREATE TABLE readings (
flight VARCHAR(10) NOT NULL,
ts TIMESTAMP NOT NULL,
temp NUMERIC(3,1) NOT NULL,
pressure NUMERIC(4,0) NOT NULL,
humidity NUMERIC(3,0) NOT NULL,
accel_x REAL DEFAULT 0 NOT NULL,
accel_y REAL DEFAULT 0 NOT NULL,
accel_z REAL DEFAULT 0 NOT NULL,
CONSTRAINT readings_pk PRIMARY KEY (flight, ts),
CONSTRAINT temp_ck CHECK (temp BETWEEN -70 AND 70),
CONSTRAINT pres_ck CHECK (pressure BETWEEN 0 AND 2000),
CONSTRAINT hum_ck CHECK (humidity BETWEEN 0 AND 100)
);
TEXT
unless you really mean itNOT NULL
>>> sql = """ ... paste SQL here ... """
>>> c.execute(sql)
Yup … that's it
data::
INSERT INTO readings(flight, ts, temp, pressure, humidity)
VALUES ('hab1', '2015-01-01 09:00:00', 25.5, 1020, 40);
-- Inserting multiple rows in a single statement
INSERT INTO readings(flight, ts, temp, pressure, humidity)
VALUES
('hab1', '2015-01-01 09:01:00', 25.5, 1019, 40),
('hab1', '2015-01-01 09:02:00', 25.5, 1019, 41);
-- Inserting the results of a query
INSERT INTO readings
SELECT flight, DATETIME(ts, '+3 minutes') AS ts, temp,
pressure, humidity, accel_x, accel_y, accel_z
FROM readings;
'I am a string'
"I am NOT a string"
(in most engines)-- I am a single line comment
/* I am a multi-line comment */
>>> pd.set_option('display.width', 120)
>>> pd.read_sql('readings', c)
flight ts temp pressure humidity accel_x accel_y accel_z
0 hab1 2015-01-01 09:00:00 25.5 1020 40 0 0 0
1 hab1 2015-01-01 09:01:00 25.5 1019 40 0 0 0
2 hab1 2015-01-01 09:02:00 25.5 1019 41 0 0 0
3 hab1 2015-01-01 09:03:00 25.5 1020 40 0 0 0
4 hab1 2015-01-01 09:04:00 25.5 1019 40 0 0 0
5 hab1 2015-01-01 09:05:00 25.5 1019 41 0 0 0
COMMIT
COMMIT
says it's complete, it's safe to
take an axe to the power cable (well … for your data anyway)COMMIT
and ROLLBACK
are the usual
termsCOMMIT
/
ROLLBACK
is usually implicit
>>> t = c.begin()
>>> c.execute(""" ... some SQL ... """)
>>> t.rollback()
>>> t = c.begin()
>>> c.execute(""" ... some SQL ... """)
>>> t.commit()
This is fine at the REPL, but …
with c.begin():
c.execute(""" ... some SQL ... """)
raise Exception('Something went wrong!')
with c.begin():
c.execute(""" ... some SQL ... """)
This is more normal in code (implicit transaction end)
-- Delete a row
DELETE FROM readings WHERE ts = '2015-01-01 09:05:00';
-- Delete some rows
DELETE FROM readings WHERE ts > '2015-01-01 09:02:00';
-- Delete EVERYTHING! (mwuhahahaha)
DELETE FROM readings;
Download this (≈70Mb): www.waveform.org.uk/presentations/sql/flight-data.csv
This is a dataset (≈200k rows) of sensor readings from a HAB flight with a Raspberry Pi Sense HAT.
def load_data(filename):
c.execute("DELETE FROM readings")
sql = """
INSERT INTO readings
(flight, ts, temp, pressure, humidity,
accel_x, accel_y, accel_z)
VALUES
('hab1', ?, ?, ?, ?, ?, ?, ?)
"""
data = pd.read_csv(filename)
for row in data.itertuples():
c.execute(sql, (
row.timestamp, row.temp_h, row.pressure,
row.humidity, row.accel_x, row.accel_y,
row.accel_z
))
?
(positional parameter)
def load_data(filename):
insert = """
INSERT INTO readings
(flight, ts, temp, pressure, humidity,
accel_x, accel_y, accel_z)
VALUES
('hab1', ?, ?, ?, ?, ?, ?, ?)
"""
data = pd.read_csv(filename)
c.execute("DELETE FROM readings")
for row in data.itertuples():
c.execute(insert, (
row.timestamp, row.temp_h, row.pressure,
min(100, max(0, row.humidity)),
row.accel_x, row.accel_y, row.accel_z
))
def load_data(filename):
insert = """
INSERT INTO readings
(flight, ts, temp, pressure, humidity,
accel_x, accel_y, accel_z)
VALUES
('hab1', ?, ?, ?, ?, ?, ?, ?)
"""
data = pd.read_csv(filename)
with c.begin():
c.execute("DELETE FROM readings")
for row in data.itertuples():
c.execute(insert, (
row.timestamp, row.temp_h, row.pressure,
min(100, max(0, row.humidity)),
row.accel_x, row.accel_y, row.accel_z
))
This is ≈100× faster
def load_data(filename):
insert = """
INSERT INTO readings
(flight, ts, temp, pressure, humidity,
accel_x, accel_y, accel_z)
VALUES
('hab1', ?, ?, ?, ?, ?, ?, ?)
"""
data = pd.read_csv(filename)
data = [
(row.timestamp, row.temp_h, row.pressure,
min(100, max(0, row.humidity)),
row.accel_x, row.accel_y, row.accel_z)
for row in data.itertuples()
]
with c.begin():
c.execute("DELETE FROM readings")
c.execute(insert, data)
This is ≈2× faster. See also: to_sql()
in Pandas
-- Increase the temperature for all records
UPDATE readings SET temp = temp + 1;
-- Shift all hab1 readings to hab2
UPDATE readings SET flight = 'hab2' WHERE flight = 'hab1';
-- ... or something more complex
UPDATE readings SET temp = temp * CASE
WHEN pressure BETWEEN 0 AND 100 THEN 0.9
WHEN pressure BETWEEN 101 AND 800 THEN 1.0
WHEN pressure BETWEEN 801 AND 1000 THEN 1.1
END
WHERE pressure <= 1000;
MERGE
REPLACE
WITH (UPDATE…)
INSERT … ON CONFLICT
INSERT … IGNORE
CREATE TABLE flights (
flight VARCHAR(10) NOT NULL,
lat REAL NOT NULL DEFAULT 0.0,
long REAL NOT NULL DEFAULT 0.0,
CONSTRAINT flights_pk PRIMARY KEY (flight),
CONSTRAINT lat_ck CHECK (lat BETWEEN -90 AND 90),
CONSTRAINT long_ck CHECK (long BETWEEN -180 AND 180)
);
INSERT INTO flights(flight) VALUES ('hab1');
ALTER TABLE readings
ADD CONSTRAINT flights_fk FOREIGN KEY (flight)
REFERENCES flights(flight) ON DELETE CASCADE;
CREATE TABLE readings2 (
flight VARCHAR(10) NOT NULL,
ts TIMESTAMP NOT NULL,
temp NUMERIC(3,1) NOT NULL,
pressure NUMERIC(4,0) NOT NULL,
humidity NUMERIC(3,0) NOT NULL,
accel_x REAL DEFAULT 0 NOT NULL,
accel_y REAL DEFAULT 0 NOT NULL,
accel_z REAL DEFAULT 0 NOT NULL,
CONSTRAINT readings_pk PRIMARY KEY (flight, ts),
CONSTRAINT temp_ck CHECK (temp BETWEEN -70 AND 70),
CONSTRAINT pres_ck CHECK (pressure BETWEEN 0 AND 2000),
CONSTRAINT hum_ck CHECK (humidity BETWEEN 0 AND 100),
CONSTRAINT flights_fk FOREIGN KEY (flight)
REFERENCES flights(flight) ON DELETE CASCADE
);
PRAGMA foreign_keys=on;
INSERT INTO flights(flight) VALUES ('hab1');
INSERT INTO readings2 SELECT * FROM readings;
DROP TABLE readings;
ALTER TABLE readings2 RENAME TO readings;
FOREIGN KEY
by default
(compatibility)ALTER … DROP
(argh)Welcome to the stupidest syntax in SQL:
FROM
WHERE
GROUP BY
HAVING
SELECT
ORDER BY
LIMIT
The output of each clause is a single
table; yes, even the FROM
clause!
INSERT INTO flights(flight) VALUES ('hab2'), ('hab3');
INSERT INTO readings
SELECT
'hab2', ts, temp, pressure, humidity,
accel_x, accel_y, accel_z
FROM readings
WHERE flight = 'hab1';
INSERT INTO readings
SELECT
'hab3', ts, temp, pressure, humidity,
accel_x, accel_y, accel_z
FROM readings
WHERE flight = 'hab1';
SELECT COUNT(*) FROM readings;
DELETE FROM flights WHERE flight = 'hab3';
SELECT COUNT(*) FROM readings;
joins::
table-ref::
All joins start life as a cross product:
id | value_a |
---|---|
A | 1 |
B | 2 |
C | 3 |
id | value_b |
---|---|
A | 4 |
B | 5 |
foo.id | bar.id | foo.value_a | bar.value_b |
---|---|---|---|
A | A | 1 | 4 |
A | B | 1 | 5 |
B | A | 2 | 4 |
B | B | 2 | 5 |
C | A | 3 | 4 |
C | B | 3 | 5 |
Then the predicate is applied to filter rows:
foo.id | bar.id | foo.value_a | bar.value_b |
---|---|---|---|
A | A | 1 | 4 |
B | B | 2 | 5 |
Technically, the result set has two columns named “id”; this is normal in SQL and is why column order matters
All joins form a tree which is evaluated to form the output of the
FROM
clause:
SELECT *
FROM a
JOIN b ON a.id = b.ref_id
JOIN (
c JOIN d ON c.id = d.ref_id
) ON b.cid = c.ref_id
SELECT f.flight, r.ts, r.temp
FROM flights f JOIN readings r ON f.flight = r.flight
Some engines will perform join elimination when foreign keys permit them to, as in the above case (unfortunately SQLite is not one of them)
Yup … that's it
CREATE VIEW flight_readings AS
SELECT
r.flight, f.lat, f.long, r.ts,
r.temp, r.pressure, r.humidity,
r.accel_x, r.accel_y, r.accel_z
FROM
flights AS f
JOIN readings AS r ON f.flight = r.flight;
SQL compilers are smarter than most people realize!
SELECT flight, ts, temp FROM flight_readings;
On a smart engine this will become:
SELECT flight, ts, temp FROM readings;
Questions?