2. 3+ years in startups
Built systems
supporting 1mil+
users
Delivered to Fortune
10 companies
Greg Goltsov
Data Hacker
gregory.goltsov.info
@gregoltsov
20. WITH new_users AS (SELECT ...),
unverified_user_ids AS (SELECT ...)
SELECT COUNT(new_users.id)
FROM new_users
WHERE new_users.id NOT IN
(SELECT id FROM unverified_user_ids);
Postgres WITH
41. ├── Makefile <- Makefile with commands like `make data` or `make train`
├── data
│   ├── external <- Data from third party sources.
│   ├── interim <- Intermediate data that has been transformed.
│   ├── processed <- The final, canonical data sets for modeling.
│   └── raw <- The original, immutable data dump.
├── docs <- A default Sphinx project; see sphinx-doc.org for details
├── models <- Trained and serialized models, model predictions
├── notebooks <- Jupyter notebooks
├── references <- Data dictionaries, manuals, and all other explanatory materials.
├── reports <- Generated analysis as HTML, PDF, LaTeX, etc.
├── requirements.txt <- The requirements file for reproducing the env
├── src <- Source code for use in this project.
│   ├── data <- Scripts to download or generate data
│   │   └── make_dataset.py
│   ├── features <- Scripts to turn raw data into features for modeling
│   │   └── build_features.py
│   ├── models <- Scripts to train models and then use trained models to make
│   │   │        predictions
│   │   ├── predict_model.py
│   │   └── train_model.py
│   └── visualization <- Scripts to create exploratory and results oriented visualizations
│       └── visualize.py
42. dataset.readthedocs.io
Just write SQL
# connect, return rows as objects with attributes
db = dataset.connect('postgresql://…', row_type=stuf)
rows = db.query('SELECT country, COUNT(*) c
FROM user GROUP BY country')
# get data into pandas, that's where the fun begins!
rows_df = pandas.DataFrame.from_records(rows)
43. # sklearn-pandas
mapper = DataFrameMapper([
(['country'], [sklearn.preprocessing.Imputer(),
sklearn.preprocessing.StandardScaler()]),
...])
# pipeline to convert DataFrame to ML representation
pipeline = sklearn.pipeline.Pipeline([
('featurise', mapper),
('feature_selection', feature_selection.SelectKBest()),
('random_forest', ensemble.RandomForestClassifier())])
# set up search space for the best model
cv = grid_search.GridSearchCV(pipeline, param_grid=dict(…))
cv.fit(X, y)  # best_estimator_ exists only after fitting
best_model = cv.best_estimator_