SQL Flashcards

1
Q

Like

A
  • Used to search for a pattern in a field % match zero, one, or many characters
    ~~~
    SELECT name
    FROM people
    WHERE name LIKE ‘Ade%’;
    ~~~
  • match a single character
    ~~~
    SELECT name
    FROM people
    WHERE name LIKE E_’;
    ~~~
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
2
Q

Not like

A
SELECT name
FROM people
WHERE name NOT LIKE 'A.%';
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
3
Q

Wildcard position

A
SELECT name
FROM people
WHERE name LIKE '%г*;

Returns = A.J. Langer

SELECT name
FROM people
WHERE name LIKE _
_t%';

returns = Anthony

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
4
Q

Where, In

A
WHERE, IN
SELECT title
FROM films
WHERE release_year IN (1920, 1930, 1940);
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
5
Q

Missing values

A

Missing values
* COUNT (field_name) includes only non-missing values
* COUNT (*) includes missing values

null
* Missing values:
* Human error
* Information not available

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
6
Q

Is Not Null

A

IS NOT NULL
~~~
SELECT COUNT (*) AS no_birthdates
FROM people
WHERE birthdate IS NOT NULL;
~~~

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
7
Q

Summarizing data: Aggregate functions

A

AVG() , SUM() , MIN() , MAX() , COUNT ()

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
8
Q

Aggregate functions vs. arithmetic

A

Aggregate functions : aggregates columns
Arithmetic functions : aggregates rows

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
9
Q

Order of execution

A
  • Step 1: FROM
  • Step 2: WHERE
  • Step 3: SELECT (aliases are defined here)
  • Step 4: LIMIT
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
10
Q

Round

A

ROUND (number_to_round, decimal_places)
~~~
SELECT ROUND (AVG (budget), 2) AS avg_budget
FROM films
WHERE release_year >= 2010;
~~~

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
11
Q

HAVING

A
SELECT release_year,
COUNT (title) As title_count
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;

Aggregate functions have to be in this order, group by and then having. You can’t use count

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
12
Q

USING

A
Inner join of presidents and prime_ministers, joining on country
SELECT p1.country, pl.continent, prime_minister, president
FROM prime_ministers AS p1
INNER JOIN presidents AS p2
USING (country);
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
12
Q

ON

A
Inner join of presidents and prime_ministers, joining on country
SELECT prime_ministers.country, prime_ministers.continent, prime_minister, president
FROM prime_ministers
INNER JOIN presidents
ON prime_minsters.country = presidents. country;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
13
Q

Chaining Joins

A
SELECT
p1. country, p1. continent, president, prime_minister, pm_start
FROM prime ministers as p1
INNER JOIN presidents as p2
USING(country)
INNER JOIN prime_minister_terms as p3
USING (prime_minister);

Chaining joins + and statements

SELECT name, e.year, fertility_rate, unemployment_rate
FROM countries AS c
INNER JOIN populations AS p
ON c. code = p. country_code
INNER JOIN economies AS e
ON c. code = e. code
-- Add an additional joining condition such that you are also joining on year
and p.year = e.year
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
14
Q

Cross Join

A

Returns all possible combinations

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
15
Q

Union Syntax

A

Returns unique values

SELECT *
FROM left table
UNION
SELECT *
FROM right_table;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
16
Q

Union All Syntax

A

Returns duplicate values
~~~
SELECT *
FROM left_table
UNION ALL
SELECT *
FROM right_table;
~~~

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
17
Q

Subqueries in where clause - semi join

A
SELECT
   president,
   country,
   continent 
FROM
   presidents 
WHERE
   country IN 
   (
      SELECT
         country 
      FROM
         states 
      WHERE
         indep _year < 1800
   )
;
  • SELECT country,
         president
    FROM presidents
    WHERE continent LIKE '%America'
    AND country NOT IN
      (SELECT country
       FROM states
       WHERE indep_year < 1800);
  • SELECT *
    FROM populations
    Filter for only those populations where life expectancy is 1.15 times higher than average
    WHERE life _expectancy > 1.15 *
    (SELECT AVG (life_expectancy)
    FROM populations
    WHERE year = 2015)
    AND year=2015;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
18
Q

Subqueries in select clause

A
SELECT DISTINCT continent,
(SELECT COUNT (*)
FROM monarchs
WHERE states.continent = monarch.continent) AS monarch_count
FROM states;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
19
Q

Subqueries in from clause - self join

A
SELECT DISTINCT monarchs.continent, sub.most_recent
FROM monarchs,
(SELECT
continent,
MAX(indep_year AS most_recent
FROM states
GRAUP BY continent) AS sub
WHERE monarchs.continent = sub.continent

Example 2

select name, country_code, city_proper_pop, metroarea_pop, (city_proper_pop / metroarea_pop 100 as city perc 
from cities
where name IN
( select capital
where continent like %Furone%' or continent LIKE %America')
 and metroarea_pop is not null
order by city_perc DESC
limit 10

example 3
~~~
SELECT
– Select country name and the count match IDs
c. AS country_name,
COUNT(sub) AS matches
FROM country AS c
– Inner join the subquery onto country
– Select the country id and match id columns
inner join (SELECT country_id, id
FROM match
– Filter the subquery by matches with 10+ goals
WHERE (home_goal + away_goal) >= 10 ) AS sub
ON c.country = sub.id
GROUP BY country_name;
~~~

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
20
Q

CASE WHEN

A
SELECT 

CASE
           WHEN hometeam_id = 10189 THEN 'FC Schalke 04'
           WHEN hometeam_id = * 9823 THEN 'FC Bayern Munich' 
					 ELSE 'Other' END AS home team,
					 
					 COUNT(id) AS total_matches
FROM matches germany
GROUP BY the CASE STATEMENT
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
21
Q

True or false: Subqueries in where can only return one column

A

True

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
22
Q

True or false: subqueries in select return a single aggregate value

A

True

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
23
Q

When Condition

A
SELECT date, hometeam_id, awayteam_id,
CASE WHEN hometeam_id = 8455 AND home_goal > away_goal
THEN 'Chelsea home win!'
WHEN awayteam_id = 8455 AND home_goal < away_goal
THEN 'Chelsea away win!'
ELSE 'Loss or tie :(' END AS outcome
FROM match
WHERE hometeam_id = 8455 OR awayteam_id = 8455;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
24
Q

Case when in select statement

A
SELECT
season,
COUNT(CASE WHEN hometeam id = 8650
AND home_goal > away_goal
THEN id END) AS home_wins
FROM match
GROUP BY season;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
25
Q

Window Function : Over clause

A
SELECT
date,
(home_goal + away_goal) AS goals,
AVG (home_goal + away_goal) OVER() AS overall_avg
FROM match
WHERE season = '2011/2012';
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
26
Q

string_agg

A

Enter STRING AGG
* STRING_AGG (column, separator) takes all the values of a column and concatenates them, with separator in between each value
STRING_AGG (Letter, ‘, “) transforms this…

How well did you know this?
1
Not at all
2
3
4
5
Perfectly
27
Q

Over with a partition

A
SELECT
date,
(home_goal + away_goal) AS goals,
AVG (home_goal + away_goal) OVER(PARTITION BY season) AS season_avg
FROM match;
How well did you know this?
1
Not at all
2
3
4
5
Perfectly
28
Q

Partition with multiple columns

A
SELECT
c. name,
m. season,
(home_goal + away_goal) AS goals,
AVG (home_goal + away_goal)
OVER(PARTITION BY m.season, c.name) AS season_ctry_avg
FROM country AS c
LEFT JOIN match AS m
on c.id = m.country_id
29
Q

Sliding Windows

A

Sliding window frame
~~~
– Manchester City Home Games
SELECT date,
home_goal, away_goal,
SUM (home_goal)
OVER(ORDER BY date
ROWS BETWEEN 1 PRECEDING
AND CURRENT ROW) AS last2
FROM match
WHERE hometeam_ id = 8456
AND season = ‘2011/2012’;
~~~

Sliding window keywords
ROWS BETWEEN <start> AND <finish>
PRECEDING
FOLLOWING
UNBOUNDED PRECEDING
UNBOUNDED FOLLOWING
CURRENT ROW</finish></start>

30
Q

CTE

A
-- Set up the home team CTE
WITH home AS (
  SELECT m.id, t.team_long_name,
	  CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
		   WHEN m.home_goal < m.away_goal THEN 'MU Loss' 
  		   ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.hometeam_id = t.team_api_id),
-- Set up the away team CTE
away AS (
  SELECT m.id, t.team_long_name,
	  CASE WHEN m.home_goal > m.away_goal THEN 'MU Loss'
		   WHEN m.home_goal < m.away_goal THEN 'MU Win' 
  		   ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.awayteam_id = t.team_api_id)
-- Select team names, the date and goals
SELECT DISTINCT
    m.date,
    home.team_long_name AS home_team,
    away.team_long_name AS away_team,
    m.home_goal, m.away_goal
-- Join the CTEs onto the match table
FROM match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
      AND (home.team_long_name = 'Manchester United' 
           OR away.team_long_name = 'Manchester United');
31
Q

Window function : Row number

A
SELECT
Year, Event, Country,
ROW_NUMBER() OVER () AS Row_N
FROM Summer_Medals
WHERE
Medal = 'Gold';

OR

```
SELECT
Year,
– Assign numbers to each year
row_number() Over() AS Row_N
FROM (
SELECT year
FROM Summer_Medals
ORDER BY Year ASC
) AS Years
ORDER BY Year ASC;
~~~

OR

Here the biggest will be assigned a rank of 1
~~~
_SELECT
Year, Event, Country,
ROW NUMBER() OVER
(ORDER BY Year DESC, Event ASC) AS ROW_N
FROM Summer_Medals
WHERE
Medal = ‘Gold’;
~~~

32
Q

Window function: LAG

A

Lag lets you get the last value , like creating a table with this years winner vs last years

WITH Discus_Gold AS (
SELECT
Year, Country AS Champion
FROM Summer_Medals
WHERE
Year IN (1996, 2000, 2004, 2008, 2012)
AND Gender = 'Men' AND Medal = 'Gold'
AND Event = 'Discus Throw')
SELECT
Year, Champion,
LAG (Champion, 1) OVER
(ORDER BY Year ASC) AS Last_Champion
FROM Discus_Gold
ORDER BY Year ASC;
33
Q

Window function: Partition

A

Finding last years champion for example:
~~~

WITH Tennis_Gold AS (
SELECT DISTINCT
Gender, Year, Country
FROM Summer_Medals
WHERE
Year >= 2000 AND
Event = ‘Javelin Throw’ AND
Medal = ‘Gold’)

SELECT
Gender, Year,
Country AS Champion,
– Fetch the previous year’s champion by gender
LAG(Country,1) OVER (PARTITION BY gender
ORDER BY Year ASC) AS Last_Champion
FROM Tennis_Gold
ORDER BY Gender ASC, Year ASC;
~~~

to partition more that one thing : partition by gender,event

34
Q

Window Functions: Fetching

A

The four functions
Relative
* LAG (column, n) returns column’s value at the row n rows before the current row
* LEAD (column, n) returns column’s value at the row n rows after the current row
Absolute
* FIRST_VALUE (column) returns the first value in the table or partition
* LAST VALUE (column) returns the last value in the table or partition

EXAMPLE A = LEAD Query
~~~
WITH Hosts AS (
SELECT DISTINCT Year, City
FROM Summer_Medals)
SELECT
Year, City,
LEAD (City, 1) OVER (ORDER BY Year ASC) AS Next_City,
LEAD (City, 2) OVER (ORDER BY Year ASC) AS After_Next_City
FROM Hosts
ORDER BY Year ASC;
~~~

EXAMPLE B =
~~~
SELECT
Year, City,
FIRST_VALUE (City) OVER
(ORDER BY Year ASC) AS First_City, LAST_VALUE(City) OVER (
ORDER BY Year ASC
RANGE BETWEEN
UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING
) AS Last_City
FROM Hosts
ORDER BY Year ASC;
~~~

EXAMPLE C :
~~~

WITH Hosts AS (
SELECT DISTINCT Year, City
FROM Summer_Medals)

SELECT
Year,
City,
– Get the last city in which the Olympic games were held
Last_value(city) OVER (
ORDER BY year ASC
RANGE BETWEEN
UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING
) AS Last_City
FROM Hosts
ORDER BY Year ASC;
~~~

35
Q

Window function: ranking

A

The ranking functions
* ROW_NUMBER() always assigns unique numbers, even if two rows’ values are the same
* RANK() assigns the same number to rows with identical values, skipping over the next numbers in such cases
* DENSE_RANK() also assigns the same number to rows with identical values, but doesn’t skip over the next numbers

WITH Country_Games AS (...)
SELECT
Country, Games,
ROW NUMBER ()
OVER (ORDER BY Games DESC) AS ROW_N
FROM Country_Games
ORDER BY Games DESC, Country ASC;
36
Q

Window function: Paging/Frames

A

What is paging?
* Paging: Splitting data into (approximately) equal chunks
* Uses
* Many APIs return data in “pages” to reduce data being sent
* Separating data into quartiles or thirds (top middle 33%, and bottom thirds) to judge performance
Enter NTILE
* NTILE(n) splits the data into n approximately equal pages

WITH Country_Medals AS (
SELECT
Country, COUNT(* AS Medals
FROM Summer_Medals
GROUP BY Country),
SELECT
Country, Medals,
NTILE (3) OVER (ORDER BY Medals DESC) AS Third
FROM Country_Medals;

The you can group the results
~~~

WITH Country_Medals AS (…),
Thirds AS (
SELECT
Country, Medals,
NTILE (3) OVER (ORDER BY Medals DESC AS Third
FROM Country_Medals)
SELECT
Third,
ROUND(AVG (Medals), 2) AS Avg_Medals
FROM Thirds
GROUP BY Third
ORDER BY Third ASC;
~~~

37
Q

Window Functions: Aggregation

A
WITH Brazil_Medals AS (...)
SELECT
Year, Medals,
MAX(Medals)
OVER (ORDER BY Year ASC) AS Max_ Medals
FROM Brazil_Medals;
38
Q

Window Functions : Frames

A

ROWS BETWEEN
* ROWS BETWEEN [START AND [FINISH]
o n PRECEDING: n rows before the current row
* CURRENT ROW: the current row
o n FOLLOWING : n rows after the current row
Examples
* ROWS BETWEEN 3 PRECEDING AND CURRENT ROW = 4 rows
* ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING = 3 rows
* ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING = 5 rows

WITH Russia_Medals AS (...)
SELECT
Year, Medals,
MAX(Medals)
OVER (ORDER BY Year ASC) AS Max_Medals,
MAX (Medals)
OVER ORDER BY Year ASC
ROWS BETWEEN
1 PRECEDING AND CURRENT ROW)
AS Max_Medals_Last
FROM Russia_Medals
ORDER BY Year ASC;
39
Q

Window Function: Moving Averages/Sum

A
  • Moving average (MA): Average of last n periods
  • Example: 10-day MA of units sold in sales is the average of the last 10 days’ sold units
  • Used to indicate momentum/trends
  • Also useful in eliminating seasonality
  • Moving total: Sum of last n periods
  • Example: Sum of the last 3 Olympic games’ medals
  • Used to indicate performance; if the sum is going down, overall performance is going down
WITH US Medals AS (...)
SELECT
Year, Medals, AVG(Medals) OVER
(ORDER BY Year ASC
ROWS BETWEEN
2 PRECEDING AND CURRENT ROW) AS Medals_MA
FROM US_Medals
ORDER BY Year ASC;

Note moving averages cant see into the future, so its always going to end on current row

40
Q

Pivots/Crosstab

A

Enter CROSSTAB
~~~
CREATE EXTENSION IF NOT EXISTS tablefunc;
SELECT * FROM CROSSTAB ($$ source_sql TEXT
$$) AS ct (column_1 DATA_TYPE_1,
column_2 DATA_TYPE_2,
* . . ,
column_n DATA_TYPE_N);
~~~

Example
~~~
CREATE EXTENSION IF NOT EXISTS tablefunc;
SELECT * FROM CROSSTAB($$
SELECT
Country, Year, COUNT(*) :: INTEGER AS Awards
FROM Summer_Medals
WHERE
Country IN (‘CHN’, ‘RUS’, ‘USA’) AND Year IN (2008, 2012)
AND Medal = ‘Gold’
GROUP BY Country, Year
ORDER BY Country ASC, Year ASC;
$$) AS ct (Country VARCHAR, “2008” INTEGER, “2012” INTEGER
ORDER BY Country ASC;
~~~

41
Q

ROLLUP and CUBE

A
SELECT
Country, Medal, COUNT(*) AS Awards
FROM Summer_Medals
WHERE
Year = 2008 AND Country IN ('CHN', 'RUS')
GROUP BY Country, ROLLUP(Medal)
ORDER BY Country ASC, Medal ASC;
  • ROLLUP is a GROUP BY subclause that includes extra rows for group-level aggregations
  • GROUP BY Country, ROLLUP(Medal) will count all Country - and Medal -level totals, then count only Country -level totals and fill in Medal with nulls for these rows

Enter CUBE
~~~
SELECT
Country, Medal, COUNT(* AS Awards
FROM summer_medals
WHERE
Year = 2008 AND Country IN (‘CHN’, ‘RUS’)
GROUP BY CUBE(Country, Medal)
ORDER BY Country ASC, Medal ASC;
~~~

  • CUBE is a non-hierarchical ROLLUP
  • It generates all possible group-level aggregations
  • CUBE (Country, Medal) counts Country -level, Medal -level, and grand totals
42
Q

COALESCE

A

Enter COALESCE
* COALESCE() takes a list of values and returns the first non- null value, going from left to right
* COALESCE (null, null, 1, null, 2) ? 1
* Useful when using SQL operations that return nulls
* * * ROLLUP and CUBE
* * * Pivoting
* * * LAG and LEAD

SELECT
COALESCE(Country, 'Both countries') AS Country,
COALESCE (Medal, 'All medals') AS Medal,
COUNT (*) AS Awards
FROM summer_medals
WHERE
Year = 2008 AND Country IN ('CHN', 'RUS')
GROUP BY ROLLUP(Country, Medal)
ORDER BY Country ASC, Medal ASC;

it replaces the nulls with ‘both countries’ and ‘all medals’ respectively

43
Q

STRING_AGG

A

STRING_AGG

  • STRING_AGG (column, separator) takes all the values of a column and concatenates them, with separator in between each value
    STRING_AGG(Letter, “, “) transforms this…
44
Q

How to query for data type

A

Determining data types from existing tables
~~~
SELECT
column_name, data_type
FROM INFORMATION_SCHEMA.COLUMNS
WHERE column name in (‘title’, ‘description’, ‘special_features’)
AND table_name =’film’;
~~~

45
Q

INTERVAL data types

A
SELECT rental_date + INTERVAL '3 days' as expected_return
from rental
46
Q

Creating Tables in Postgres

A

Before we get started, creating tables in sql
~~~
CREATE TABLE example
CREATE TABLE my_first_table ( first_column text, second_column integer);
INSERT example
INSERT INTO my_first_table
(first_column, second_column) VALUES (‘text value’, 12);
~~~

creating tables in postgress, supports arrays

CREATE TABLE grades ( student_id int, email text[][], test_scores int[]
);

Note that PostgreSQL array indexes start with one and not zero.

47
Q

The ANY function in Postgress

A

The ANY function allows you to search for a value in any index position of an ARRAY. Here’s an example.
WHERE 'search text' = ANY(array_name)

The contains operator @> operator is alternative syntax to the ANY function and matches data in an ARRAY using the following syntax.
~~~

WHERE array_name @> ARRAY[‘search text’] :: type[]
WHERE special_features @> ARRAY[‘Deleted Scenes’];

~~~

48
Q

DATES in Postgres

A

Calculating time periods with AGE
SELECT AGE(timestamp '2005-09-11 00:00:00', timestamp '2005-09-09 12:00:00');
~~~

SELECT INTERVAL ‘1’ day * timestamp ‘2019-04-10 12:34:56’
~~~

Retrieving the current timestamp
SELECT NOW::timestamp;
SELECT CAST(NOW() as timestamp);
~~~

SELECT CURRENT_ TIMESTAMP (2);
~~~

CURRENT DATE
SELECT CURRENT_DATE;

Extracting and transforming date / time data
* EXTRACT field FROM source
~~~
SELECT EXTRACT (quarter FROM timestamp 2005-01-24 05:12:00’) AS quarter;
~~~
* DATE_PART(‘field’, source)
~~~
SELECT DATE_PART (‘quarter’, timestamp ‘2005-01-24 05:12:00’) AS quarter;
~~~

EXAMPLE A
~~~
SELECT
EXTRACT (quarter FROM payment_date) AS quarter,
EXTRACT (year FROM payment_date) AS year,
SUM (amount) AS total_payments
FROM
payment
GROUP BY 1, 2;
~~~

Truncating timestamps using DATE_TRUNCO
SELECT DATE_TRUNC ('year', TIMESTAMP 2005-05-21 15:30:30');

49
Q

Postgres Date

A
SELECT
c.first_name || ' ' |I c. last_name AS customer_name, f.title, r.rental_date,
- - Extract the day of week date part from the rental_date
EXTRACT (dow FROM r. rental_date) AS dayofweek,
AGE (r.return_date, r.rental_date) AS rental_days,
- Use DATE_TRUNC to get days from the AGE function
CASE WHEN DATE_TRUNC(' day', AGE(r. return_date, r.rental_date)) >
- - Calculate number of d
f.rental_duration * INTERVAL '1' day
THEN TRUE
ELSE FALSE END AS past_due
FROM
film AS f
INNER JOIN inventory AS i
ON f.film_id = i.film_id
INNER JOIN rental AS r
ION i.inventory_id = r. inventory_id
INNER JOIN customer AS c
ON c.customer_id = r. customer_id
WHERE
- Use an INTERVAL for the upper bound of the rental_date r.rental_date BETWEEN CAST ('2005-05-01' AS DATE) AND CAST ('2005-05-01' AS DATE) + INTERVAL '90 day';
50
Q

String Concatination

A
SELECT
first_name, last_name,
first_name |1 ' ' I last_name AS full_name
FROM customer

String concatenation with functions
~~~
SELECT
CONCAT (first_name,’ ‘, last_name) AS full_name
FROM customer;
~~~

51
Q

String Manipulation: Upper, lower, Title caps, replace

A
SELECT
UPPER (email)
FROM customer;

Lower, initcap

Replacing characters in a string
~~~
SELECT
REPLACE(description, ‘A Astounding’,
‘An Astounding’) as description
FROM film;
~~~

52
Q

Advanced String Functions

A

Char Length = length of characters in a string
~~~
SELECT
title, CHAR_LENGTH(title) or length(title)l
FROM film;
~~~

Index/position of a character in a string
~~~
SELECT
email,
POSITION(‘@’ IN email)
FROM customer;
~~~

Parsing : extract first n position, you can also use left
~~~
SELECT
RIGHT (description, 50)
FROM film;
~~~

Substring
~~~
SELECT
FROM
SUBSTRING(description, 10, 50)
film AS f;
~~~

Extracting substrings of character data
~~~
SELECT
SUBSTRING (email FROM 0 FOR POSITION(‘@’ IN email))
FROM
customer;
~~~

Extracting substrings of character data
~~~
SELECT
FROM
SUBSTRING(email FROM POSITION(‘@’ IN email)+1 FOR CHAR_LENGTH (email))
customer
~~~

EXAMPLE :

SELECT 
  UPPER(c.name) || ': ' || f.title AS film_category, 
  -- Truncate the description without cutting off a word
  left(description, 50 - 
    -- Subtract the position of the first whitespace character
    position(
      ' ' IN REVERSE(LEFT(description, 50))
    )
  ) 
FROM 
  film AS f 
  INNER JOIN film_category AS fc 
  	ON f.film_id = fc.film_id 
  INNER JOIN category AS c 
  	ON fc.category_id = c.category_id;
53
Q

Truncate, replace, over write

A

Trimming white space
~~~
SELECT TRIM(‘ padded”);
~~~

TRIM([leading | trailing | both] [characters] from string)

Padding strings with character data
SELECT LPAD('padded', 10, '#');

concat & pad
–result = Noor Habbal
~~~
SELECT
first_name || lpad(last_name, LENGTH(last_name)+1) AS full_name
FROM customer;
~~~

54
Q

Full text search

A

What is full-text search?
Full text search provides a means for performing natural language queries of text data in your database.
* Stemming
* Spelling mistakes
* Ranking

LIKE versus full-text search
~~~
SELECT title, description
FROM film
WHERE to_tsvector (title) @@ to_tsquery (‘elf’);
~~~

SELECT 
  title, 
  description, 
  -- Calculate the similarity
  similarity(description, 'Astounding & Drama')
FROM 
  film 
WHERE 
  to_tsvector(description) @@ 
  to_tsquery('Astounding & Drama') 
ORDER BY 
  similarity(description, 'Astounding & Drama')  DESC;

tsvector vectorizes a word, to_tsvector does a count of each word ‘ china’:90

55
Q

creating functions

A

User-defined data types
Enumerated data types
~~~
CREATE TYPE dayofweek AS ENUM (
‘Monday’,
‘Tuesday’,
‘Wednesday’,
‘Thursday’,
‘Friday’,
‘Saturday’,
‘Sunday’
):
~~~

query user-defined types
~~~
SELECT typname, typcategory
FROM pg_type
WHERE typname=’dayofweek’;
~~~

get more information about a table
~~~
SELECT column_name, data_type, udt_name
FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_name =’film’;
~~~

User-defined functions
~~~
CREATE FUNCTION squared(i integer) RETURNS integer AS $$
BEGIN
RETURN i * i;
END;
$$ LANGUAGE plpgsal;
~~~

– Select the column name, data type and udt name columns
~~~
SELECT column_name, data_type, udt_name
FROM INFORMATION_SCHEMA.COLUMNS
– Filter by the rating column in the film table
WHERE table_name =’film’ AND column_name=’rating’;
~~~

pg_type is the postgress catalog table, which holds information about all the datatypes in the databse

56
Q

Postgres extentions

A

PostgreSQL extensions - Commonly used extensions

  • PostGIS
  • PostPic
  • fuzzystrmatch
  • pg_trgm

Available Extensions
~~~
SELECT name
FROM pg_available_extensions;
~~~

Installed Extensions
~~~
SELECT extname
FROM pg_extension;
~~~

–Enable the fuzzystrmatch extension
~~~
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
~~~
–Confirm that fuzzstrmatch has been enabled
SELECT extname FROM pg_extension;
SELECT * FROM pg_extension;

Using fuzzystrmatch or fuzzy searching
~~~
SELECT levenshtein(‘GUMBO’, ‘GAMBOL’);
~~~

Compare two strings with pg_trgm
SELECT similarity ('GUMBO', 'GAMBOL');

57
Q

Reminders of NULL, IS NULL, count(col), count(*)

A

NULL = missing
IS NULL, IS NOT NULL don’t use = NULL
count (*) = number of rows
count (column_name) = number of non- NULL values
count (DISTINCT column_name) = number of different non- NULL values
SELECT DISTINCT column_name = distinct values, including NULL

58
Q

Coalesce function

A

Return the first non-null value in a list

Coalesce(col2, col3)

SELECT coalesce(industry, sector, 'Unknown') AS industry2,

so this code checks industry, if the row is null, it replaces it with the value from sector, if that’s null, it replaces it with unknown

59
Q

Casting

A

Format
– With the CAST function
SELECT CAST (value AS new_type);

60
Q

Column constraints

A
  • Foreign key: value that exists in the referenced column, or NULL
  • Primary key: unique, not NULL
  • Unique: values must all be different except for NULL
  • Not null: NULL not allowed: must have a value
  • Check constraints: conditions on the values o column1 > 0
  • columnA > columnB
61
Q

Summary Statistics

A
-- Summarize by group with GROUP BY
SELECT tag,
min (question_pct), avg (question_pct), max (question_pct)
FROM stackoverflow
GROUP BY tag;

standard deviation
~~~
SELECT stddev(question_pct)
FROM stackoverflow;
~~~

Sample Variance , divides by the number of values -1
~~~
SELECT var_samp(question_pct)
FROM stackoverflow;
~~~

population variance, difference is -1
~~~
SELECT var_pop (question_pot)
FROM stackoverflow;
~~~

62
Q

Summary function : Truncating

A

Truncate
SELECT trunc (42.1256, 2);

this is great is you’re counting and the values in the column are like 21, 22, 23. So you can use truncate to do truncate -1, and they’ll all be grouped in the 20 bin.
Esentially you can use it to examine distributions
~~~

– Truncate employees
SELECT
trunc(employees, -4) AS employee_bin,
– Count number of companies with each truncated value
count(title)
FROM
fortune500
WHERE
employees < 100000
GROUP BY
employee_bin
ORDER BY
employee_bin;
~~~

63
Q

Summary function : Generating series and creating bins

A
- - Create bins WITH bins AS (
  SELECT 
    generate_series (30, 60, 5) AS lower, 
    generate_series (35, 65, 5) AS upper
), 
-- Subset data to tag of interest ebs AS (
SELECT 
  Unanswered_count 
FROM 
  stackoverflow 
WHERE 
  tag = 'amazon-ebs'
) - - Count values in each bin 
SELECT 
  lower, 
  upper, 
  count (unanswered_count) -- left join keeps all bins
FROM 
  bins 
  LEFT JOIN ebs ON unanswered_count >= lower 
  AND Unanswered_count < upper

1) find the min and max, but using the min and max functions in sql query
2) use generate series to create the bins
3) then put the values in the query above

64
Q

Summary Functions : Corr,Median/ Percentile functions

A

correlation function
~~~
SELECT corr(assets, equity)
FROM fortune500;
~~~

Median/ Percentile functions - lets say you want the top 20% of , so you take the .8 percentile
– percentile between 0 and 1
EXAMPLE = col A , values = 1345

SELECT percentile_disc(percentile) WITHIN GROUP (ORDER BY column_name)
FROM table;

^ returns 3

SELECT percentile_cont (percentile) WITHIN GROUP (ORDER BY column_name)
FROM table;

^ returns 3.5

create temp table profit80 AS 
  SELECT sector, 
         percentile_disc(.8) within group (order by profits) AS pct80
    from fortune500 
   group by sector;
65
Q

Create temp table, drop table

A
Create Temp Table Syntax
- - Create table as
CREATE TEMP TABLE new_tablename AS
-- Query results to store in the table
SELECT column1, column2
FROM table;

DROP TABLE IF EXISTS top_companies;

EXAMPLE:
~~~
DROP TABLE IF EXISTS profit80;

CREATE TEMP TABLE profit80 AS
SELECT sector,
percentile_disc(0.8) WITHIN GROUP (ORDER BY profits) AS pct80
FROM fortune500
GROUP BY sector;

SELECT title, fortune500.sector,
profits, profits/pct80 AS ratio
FROM fortune500
LEFT JOIN profit80
ON fortune500.sector =profit80.sector
WHERE profits > pct80;
~~~

66
Q

Interesting correlation table

A
DROP TABLE IF EXISTS correlations;

CREATE TEMP TABLE correlations AS
SELECT 'profits'::varchar AS measure,
       corr(profits, profits) AS profits,
       corr(profits, profits_change) AS profits_change,
       corr(profits, revenues_change) AS revenues_change
  FROM fortune500;

INSERT INTO correlations
SELECT 'profits_change'::varchar AS measure,
       corr(profits_change, profits) AS profits,
       corr(profits_change, profits_change) AS profits_change,
       corr(profits_change, revenues_change) AS revenues_change
  FROM fortune500;

INSERT INTO correlations
SELECT 'revenues_change'::varchar AS measure,
       corr(revenues_change, profits) AS profits,
       corr(revenues_change, profits_change) AS profits_change,
       corr(revenues_change, revenues_change) AS revenues_change
  FROM fortune500;

-- Select each column, rounding the correlations
SELECT measure, 
       ROUND(profits::numeric, 2) AS profits,
        ROUND(profits_change::numeric, 2) AS profits_change,
       ROUND(revenues_change::numeric, 2) AS revenues_change
  FROM correlations;

this creates a table like

measure —- profits —- profits_change
profits —– 1 —— 2
profits_change - 1 —— 2

67
Q

Exploring categorical data and unstructured text

A
-- To clear table if it already exists
DROP TABLE IF EXISTS indicators;

-- Create the temp table
CREATE TEMP TABLE indicators AS
  SELECT id, 
         CAST (description LIKE '%@%' AS integer) AS email,
         CAST (description LIKE '%\_\_\_-\_\_\_-\_\_\_\_%' AS integer) AS phone 
    FROM evanston311;
  
-- Select the column you'll group by
SELECT priority,
       -- Compute the proportion of rows with each indicator
       SUM(email)/COUNT(*)::numeric AS email_prop, 
       SUM(phone)/COUNT(*)::numeric AS phone_prop
  -- Tables to select from
  FROM evanston311
       left JOIN indicators
       -- Joining condition
       ON evanston311.id=indicators.id
 -- What are you grouping by?
 GROUP BY priority;

________________________________________________________________________
<br></br>
SELECT*FROM fruit
WHERE lower(fav_fruit)=’apple’;

<br></br>
case insensitive = LIKE %apple%
<br></br>

trimming

trim : bothendstrim(‘ abc ‘)=’abc’
rtrim
ltrim

SELECTtrim(‘Wow!’, ‘!’); Wow
SELECTtrim(‘Wow!’, ‘!wW’); o

<br></br>
subbstring

SELECT left(‘abcde’, 2), = ab
<br></br>

Substring

SELECT substring(string FROM start FOR length)

SELECTsubstring(‘abcdef’ FROM 2 FOR 3) = bcd
<br></br>
split part
SELECT split_part(‘a,bc,d’, ‘,’, 2); bc
<br></br>
concat
SELECT concat(‘a’, 2, ‘cc’);
SELECT’a’||2||’cc’
<br></br>
~~~
– Case for each of :, -, and I
SELECT CASE WHEN category LIKE ‘%: %’ THEN split_part(category, ‘: ‘, 1)
WHEN category LIKE ‘% - %’ THEN split_part (category, ‘ - ‘, 1)
ELSE split_part(category, ‘ | ‘, 1)
END AS major_category,
– alias the result
sum (businesses)
- - also select number of businesses
FROM naics
GROUP BY major_category;
– Group by categories created above
~~~

68
Q

dates

A

Compare the number of requests created per month to the number completed.

– Compute monthly counts of requests created
WITH created AS (
SELECT date_trunc(‘month’, date_created) AS month,
count() AS created_count
FROM evanston311
WHERE category=’Rodents- Rats’
GROUP BY month),
– Compute monthly counts of requests completed
completed AS (
SELECT date_trunc(‘month’,date_completed) AS month,
count(
) AS completed_count
FROM evanston311
WHERE category=’Rodents- Rats’
GROUP BY month)
– Join monthly created and completed counts
SELECT created.month,
created_count,
completed_count
FROM created
INNER JOIN completed
ON created.month=completed.month
ORDER BY created.month;

69
Q

grouping sets : OLAP operators includes rollup and cube

A

GROUP BY GROUPING SETS
Example of a query with GROUPING SETS operator:

SELECT country,
genre,
COUNT (*)
FROM rentings_extended
GROUP BY GROUPING SETS ((country, genre), (country), (genre), ());
  • Column names surrounded by parentheses represent one level of aggregation.
    * GROUP BY GROUPING SETS returns a UNION over several GROUP BY queries.

equivalant to a group by cube (country and genre)