Automating Cheminformatics with Apache Airflow — Step 1: Preparing SMILES Transformations on Large Scale

python -m pip install apache-airflow
python -m pip install -r Cheminformatic-Airflow/requirements.txt
git clone
airflow db init

airflow users create \
--username admin \
--firstname Peter \
--lastname Parker \
--role Admin \
--email admin@example.com

nohup airflow webserver --port 8080 &
nohup airflow scheduler &
cat airflow/airflow-webserver.pid | xargs kill
kill $(ps -ef | grep "airflow scheduler" | awk '{print $2}')
dags_folder = Cheminformatic-Airflow/dags
python -c "from airflow.models import DagBag; d = DagBag();"
# Task 1: To Convert the SMILES to SDF

# Imports
# -------
import os
import pandas as pd

# RDKit & Configurations
# ----------------------
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
from rdkit.Chem.Draw import DrawingOptions

# -------------

def smiles_to_sdf(row):
    """Write the molecule described by one dataframe row to its own SDF file.

    Args:
        row: A pandas row (as handed over by ``DataFrame.apply(..., axis=1)``)
            holding the SMILES string under the ``'smiles'`` key.

    Raises:
        ValueError: If RDKit cannot parse the SMILES string.
    """
    smiles = row['smiles']
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # MolFromSmiles signals a parse failure by returning None; fail with
        # the offending input instead of an opaque error inside AddHs.
        raise ValueError('Could not parse SMILES: {!r}'.format(smiles))

    molecule_with_hs = Chem.AddHs(molecule)
    # Keep the source SMILES as a property on the molecule that is written out.
    molecule_with_hs.SetProp('smiles', smiles)

    # NOTE(review): the original file-name expression in front of "+ '.sdf'"
    # was lost; the row's index label (row.name) is assumed here, and the
    # output directory is the current working directory -- confirm both.
    sdwriter = Chem.SDWriter(str(row.name) + '.sdf')
    try:
        sdwriter.write(molecule_with_hs)
    finally:
        # Closing flushes the record to disk and releases the file handle.
        sdwriter.close()

def step_1():
    """Read the SMILES list and convert every entry to an SDF file.

    Each row of the input is passed to ``smiles_to_sdf``; nothing is returned.
    """
    # NOTE(review): the read_csv arguments were lost in this snippet. The path
    # and column name are taken from the read_csv call in step_2, which reads
    # task_1's data file -- confirm. sep='\n' is deliberately not passed:
    # recent pandas releases reject a newline separator, and a file with one
    # SMILES per line parses into a single column without it.
    smiles_dataframe = pd.read_csv(
        '/home/sulstice/airflow/dags/task_1/data/smiles.txt',
        header=None,
        names=['smiles'],
    )
    # apply() is used only for its side effect, so the result is discarded.
    _ = smiles_dataframe.apply(smiles_to_sdf, axis=1)
# Standard Python Internal Packages
# ---------------------------------
import os, sys
import pandas as pd
import datetime as dt

sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))

# Airflow & Configurations
# ------------------------
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

# Default settings dictionary for the DAG defined below.
default_args = {
    'owner': 'sulstice',
    'start_date': dt.datetime(2018, 9, 24, 10, 0, 0),
    'concurrency': 1,
    'retries': 0,
}

# Airflow Steps
# -------------

from task_1.task_1 import *
from task_2.task_2 import *

# DAG that runs every ten minutes (cron '*/10 * * * *').
# NOTE(review): the operator arguments were lost in this snippet. The callable
# is taken to be step_1 (brought in by the task_1 star import) and the task_id
# is an assumed name -- confirm. step_2 is imported as well, but no operator
# for it is visible here, so none is added.
with DAG('smiles_to_sdf',
         default_args=default_args,
         schedule_interval='*/10 * * * *',
         ) as dag:

    step1 = PythonOperator(
        task_id='step_1',
        python_callable=step_1,
    )

plugins_folder =  Cheminformatic-Airflow/plugins
from flask import Blueprint
from flask_appbuilder import expose, BaseView as AppBuilderBaseView

from airflow.plugins_manager import AirflowPlugin
from flask_admin.base import MenuLink

# Flask blueprint for the plotting plugin.
# NOTE(review): static_url_path only has an effect when a static_folder is
# also given; none is visible here -- confirm whether one was intended.
bp = Blueprint(
    "plotting_plugins", __name__,
    template_folder='templates',  # registers airflow/plugins/templates as a Jinja template folder
    # The stray trailing space was removed from the URL path.
    static_url_path='/static/plotting_plugins')

# Creating a flask appbuilder BaseView
class PCAAnalysisAppBuilderBaseView(AppBuilderBaseView):
    """Flask-AppBuilder view that serves the PCA analysis page."""

    # Directory searched for this view's templates.
    template_folder = 'Cheminformatic-Airflow/plugins/plotting_plugins/templates'

    # Without @expose the method is never registered as a route, so the
    # menu entry would have no page to open.
    @expose("/")
    def list(self):
        """Render the ``pca_analysis.html`` template."""
        return self.render_template("pca_analysis.html")

# View instance plus the menu metadata (entry name and category) that the
# plugin class lists under appbuilder_views.
pca_analysis_appbuilder_view = PCAAnalysisAppBuilderBaseView()
pca_analysis_appbuilder_package = {
    "name": "PCA Analysis",
    "category": "Pipeline Plots",
    "view": pca_analysis_appbuilder_view,
}

# Defining the plugin class
class AirflowTestPlugin(AirflowPlugin):
    """Airflow plugin named ``plotting_plugins`` that contributes one view."""

    name = "plotting_plugins"

    # Only the AppBuilder view is registered. operators, flask_blueprints,
    # hooks, executors and admin_views are intentionally left unset.
    appbuilder_views = [pca_analysis_appbuilder_package]
# Pipeline Configurations
# -----------------------
# Radius argument handed to the Morgan fingerprint call in mol2fparr.
morgan_radius = 1
# Passed as nBits to the same call, i.e. the fingerprint length in bits.
bit_representation = 512

# Imports
# -------
import sys

# Scientific Imports
# ------------------
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# RDkit Imports
# -------------
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import Draw
from rdkit.Chem import DataStructs

# Graphing Imports
# ----------------

from bokeh.plotting import ColumnDataSource, figure, output_notebook, output_file, show, save
# The module name was missing from this import; output_notebook and
# export_png are provided by bokeh.io.
from bokeh.io import output_notebook, export_png
from bokeh.layouts import gridplot

# Global Configs
# --------------

# HTML template for the hover tooltip; @ids and @img refer to columns of the
# plot's data source.
TOOLTIPS = (
    "<div>\n"
    "MolID: @ids<br>\n"
    "@img{safe}\n"
    "</div>\n"
)

# Hex colour for each integer cluster label.
colormaps = {
    0: '#e6194b',
    1: '#3cb44b',
    2: '#ffe119',
    3: '#4363d8',
    4: '#f58231',
    5: '#911eb4',
}

# Standard Functions
# ------------------

def mol2svg(mol):
    """Return a 200x100 SVG depiction of *mol* as text.

    Args:
        mol: The RDKit molecule to draw.

    Returns:
        The SVG markup produced by the drawer.
    """
    d2d = rdMolDraw2D.MolDraw2DSVG(200, 100)
    # The molecule has to be drawn and the drawing finished before the text
    # is fetched; otherwise the returned SVG does not contain the structure.
    d2d.DrawMolecule(mol)
    d2d.FinishDrawing()
    return d2d.GetDrawingText()

def mol2fparr(mol):
    """Return the Morgan fingerprint of *mol* as a NumPy array.

    The radius and bit length come from the module-level ``morgan_radius``
    and ``bit_representation`` settings, which are only read here.
    """
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(
        mol,
        morgan_radius,
        nBits=bit_representation,
    )
    # ConvertToNumpyArray writes the fingerprint into the array it is given.
    fingerprint_array = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(bit_vector, fingerprint_array)
    return fingerprint_array

def step_2():
    """Project the SMILES set with PCA, cluster it, and save an HTML plot.

    Reads the SMILES file, builds a Morgan fingerprint for every molecule,
    reduces the fingerprints with PCA, assigns KMeans cluster colours and
    writes a Bokeh scatter plot to ``pca_analysis.html`` in the plugin's
    templates folder.
    """
    # sep='\n' was dropped: recent pandas releases reject a newline separator,
    # and a file with one SMILES per line parses into one column without it.
    smiles_dataframe = pd.read_csv(
        '/home/sulstice/airflow/dags/task_1/data/smiles.txt',
        header=None,
        names=['smiles'],
    )
    smiles_list = smiles_dataframe['smiles'].to_list()

    molecules_list = [Chem.MolFromSmiles(i) for i in smiles_list]
    fingerprints_list = np.array([mol2fparr(m) for m in molecules_list])

    # A fractional n_components keeps enough components to explain 95% of
    # the variance.
    pca = PCA(n_components=0.95)
    chemicalspace = pca.fit_transform(fingerprints_list)

    kmean = KMeans(n_clusters=5, random_state=0)
    # labels_ only exists after fitting.
    # NOTE(review): the fit call was missing; fitting on the raw fingerprints
    # (rather than the PCA projection) is assumed -- confirm.
    kmean.fit(fingerprints_list)
    kmeanc = [colormaps[i] for i in kmean.labels_]

    # NOTE(review): the x / y / fill_color columns were missing from the dict
    # although the glyph call references them; the first two PCA components
    # and the cluster colours are assumed -- confirm.
    kmean_data = dict(
        x=chemicalspace[:, 0],
        y=chemicalspace[:, 1],
        img=[mol2svg(m) for m in molecules_list],
        ids=[str(i) for i in range(0, len(smiles_list))],
        fill_color=kmeanc,
    )

    source = ColumnDataSource(kmean_data)
    plot = figure(plot_width=500, plot_height=500, tooltips=TOOLTIPS, title='error_compounds')
    # NOTE(review): the glyph method name was lost where two lines were
    # fused; a circle scatter is assumed -- confirm.
    plot.circle('x', 'y', color='fill_color', size=10, fill_alpha=0.2, source=source)

    # NOTE(review): the grid contents were lost; a single-cell grid holding
    # the one plot is assumed.
    plot = gridplot([[plot]])

    output_file(filename="Cheminformatic-Airflow/plugins/plotting_plugins/templates/pca_analysis.html", title="Static HTML file")
    # output_file only configures the destination; save() writes the file.
    save(plot)





I’m a cheminformatician exploring chemical infinity,

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

Making API integration painless

It is possible to easily manage payments with the integration of Marketplace Iyzico

How Much Does It Cost To Make a Flutter App For Your Business?

Axelar Network — what is it?

NG3. What would be in a sentence structure?

Welcome José Manuel Cantera to the IOTA Foundation

Sprint Blog #5

Building a business from scratch — day 7

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Suliman Sharif

Suliman Sharif

I’m a cheminformatician exploring chemical infinity,

More from Medium

Graph Databases and Object Graph Mapping with neo4j and python

Validating SMILES with RDKit, PySMILES, MolVS, and PartialSMILES

Perfect Way of Versioning Models & Training Data

Discover AuraDB Free: Importing GEDCOM Files and Exploring Genealogy/Ancestry Data as a Graph