Automating Cheminformatics with Apache Airflow — Step 1: Preparing SMILES Transformations on Large Scale

python -m pip install apache-airflow
python -m pip install -r Cheminformatic-Airflow/requirements.txt
git clone
airflow db init

airflow users create \
--username admin \
--firstname Peter \
--lastname Parker \
--role Admin \
--email admin@example.com

nohup airflow webserver --port 8080 &
nohup airflow scheduler &
cat airflow/airflow-webserver.pid | xargs kill
kill $(ps -ef | grep "airflow scheduler" | awk '{print $2}')
dags_folder = Cheminformatic-Airflow/dags
python -c "from airflow.models import DagBag; d = DagBag();"
# Task 1: To Convert the SMILES to SDF

# Imports
# -------
import os
import pandas as pd

# RDKit & Configurations
# ----------------------
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
from rdkit.Chem.Draw import DrawingOptions

# -------------

def smiles_to_sdf(row):

molecule = Chem.MolFromSmiles(row['smiles'])
molecule_with_hs = Chem.AddHs(molecule)
molecule_with_hs.SetProp('smiles', row['smiles'])

sdwriter = Chem.SDWriter(
str( + '.sdf')

def step_1():
    """Run the SMILES-to-SDF conversion over the whole input file.

    Loads the SMILES list into a one-column DataFrame (column ``smiles``) and
    calls ``smiles_to_sdf`` once per row.
    """
    # NOTE(review): the read_csv arguments were cut off in this listing; they
    # are restored from the identical call in step_2 -- confirm the path.
    smiles_dataframe = pd.read_csv(
        '/home/sulstice/airflow/dags/task_1/data/smiles.txt',
        sep='\n',
        header=None,
        names=['smiles'],
    )
    # apply() is used only to invoke smiles_to_sdf on every row; the returned
    # Series is deliberately discarded.
    _ = smiles_dataframe.apply(smiles_to_sdf, axis=1)
# Standard Python Internal Packages
# ---------------------------------
import os, sys
import pandas as pd
import datetime as dt

sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))

# Airflow & Configurations
# ------------------------
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

# Default settings dictionary for the pipeline.
# NOTE(review): the DAG(...) call in this listing does not show default_args
# being passed in -- confirm it is wired up.
default_args = {
    'owner': 'sulstice',
    'start_date': dt.datetime(2018, 9, 24, 10, 0, 0),
    'concurrency': 1,
    'retries': 0,
}

# Airflow Steps
# -------------

from task_1.task_1 import *
from task_2.task_2 import *

with DAG('smiles_to_sdf',
schedule_interval='*/10 * * * *',
) as dag:

step1 = PythonOperator(

plugins_folder =  Cheminformatic-Airflow/plugins
from flask import Blueprint
from flask_appbuilder import expose, BaseView as AppBuilderBaseView

from airflow.plugins_manager import AirflowPlugin
from flask_admin.base import MenuLink

# Flask blueprint for the plotting plugin.
bp = Blueprint(
    "plotting_plugins",
    __name__,
    template_folder='templates',  # registers airflow/plugins/templates as a Jinja template folder
    # The trailing space that used to follow the prefix is removed: it would
    # have become part of every static URL.
    # NOTE(review): no static_folder is given, so this prefix may have no
    # effect -- confirm whether static files are meant to be served.
    static_url_path='/static/plotting_plugins',
)

# Creating a flask appbuilder BaseView
class PCAAnalysisAppBuilderBaseView(AppBuilderBaseView):
    """Flask-AppBuilder view that serves the PCA analysis page."""

    template_folder = 'Cheminformatic-Airflow/plugins/plotting_plugins/templates'

    # Flask-AppBuilder only routes methods decorated with @expose; without it
    # the view has no reachable endpoint. The method name `list` is kept
    # because it is the view's public entry point.
    @expose("/")
    def list(self):
        """Render ``pca_analysis.html`` from ``template_folder``."""
        return self.render_template("pca_analysis.html")

pca_analysis_appbuilder_view = PCAAnalysisAppBuilderBaseView()

# Descriptor handed to the plugin's `appbuilder_views`: display name, menu
# category and the view instance to register.
pca_analysis_appbuilder_package = {
    "name": "PCA Analysis",
    "category": "Pipeline Plots",
    "view": pca_analysis_appbuilder_view,
}

# Defining the plugin class
class AirflowTestPlugin(AirflowPlugin):
    """Airflow plugin exposing the PCA analysis view under the name ``plotting_plugins``."""

    name = "plotting_plugins"

    # Only the AppBuilder view is registered. The other extension points
    # (operators, flask_blueprints, hooks, executors, admin_views) are
    # intentionally left unset.
    appbuilder_views = [pca_analysis_appbuilder_package]
# Pipeline Configurations
# -----------------------
# Radius argument passed to the Morgan fingerprint call in mol2fparr.
morgan_radius = 1
# Fingerprint length in bits (the nBits argument in mol2fparr).
bit_representation = 512

# Imports
# -------
import sys

# Scientific Imports
# ------------------
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# RDkit Imports
# -------------
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import Draw
from rdkit.Chem import DataStructs

# Graphing Imports
# ----------------

from bokeh.plotting import ColumnDataSource, figure, output_notebook, output_file, show, save
# The module name was missing from this import; bokeh.io provides both names.
from bokeh.io import output_notebook, export_png
from bokeh.layouts import gridplot

# Global Configs
# --------------

# HTML template for the Bokeh hover tooltip. `@ids` and `@img` are filled from
# the `ids` and `img` columns of the data source built in step_2; `{safe}`
# lets the SVG markup in `img` be inserted as HTML instead of escaped text.
TOOLTIPS = """<div>\nMolID: @ids<br>\n@img{safe}\n</div>\n"""
# Hex colour per k-means cluster label; step_2 indexes this with kmean.labels_.
colormaps = { 0: '#e6194b', 1: '#3cb44b', 2: '#ffe119', 3: '#4363d8', 4: '#f58231', 5: '#911eb4'}

# Standard Functions
# ------------------

def mol2svg(mol):
    """Return an SVG depiction of *mol* as a string (200 x 100 canvas).

    Args:
        mol: RDKit molecule to draw.

    Returns:
        The SVG markup produced by the RDKit SVG drawer.
    """
    d2d = rdMolDraw2D.MolDraw2DSVG(200, 100)
    # The molecule has to be drawn and the drawing finished before the text is
    # fetched; otherwise the returned SVG does not contain the structure.
    rdMolDraw2D.PrepareAndDrawMolecule(d2d, mol)
    d2d.FinishDrawing()
    return d2d.GetDrawingText()

def mol2fparr(mol):
    """Return the Morgan bit-vector fingerprint of *mol* as a NumPy array.

    The radius and bit length come from the module-level ``morgan_radius``
    and ``bit_representation`` settings.
    """
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        mol, morgan_radius, nBits=bit_representation
    )
    # ConvertToNumpyArray writes the bits into the array handed to it.
    bits = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits

def step_2():
    """Cluster the SMILES library and write an interactive PCA scatter plot.

    Reads the SMILES file, computes one Morgan fingerprint per molecule,
    projects the fingerprints with PCA, assigns k-means cluster colours and
    saves a Bokeh plot (with an SVG structure tooltip per point) as
    ``pca_analysis.html`` in the plotting plugin's template folder.
    """
    smiles_dataframe = pd.read_csv(
        '/home/sulstice/airflow/dags/task_1/data/smiles.txt',
        sep='\n',
        header=None,
        names=['smiles'],
    )
    smiles_list = smiles_dataframe['smiles'].to_list()

    molecules_list = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
    fingerprints_list = np.array([mol2fparr(m) for m in molecules_list])

    pca = PCA(n_components=0.95)
    chemicalspace = pca.fit_transform(fingerprints_list)

    kmean = KMeans(n_clusters=5, random_state=0)
    # labels_ only exists after the estimator has been fitted.
    # NOTE(review): the fit call was missing from this listing; fitting on the
    # raw fingerprints (rather than the PCA projection) is an assumption.
    kmean.fit(fingerprints_list)
    kmeanc = [colormaps[label] for label in kmean.labels_]

    # NOTE(review): the x / y / fill_color entries were missing from this
    # listing; they are required by the glyph call below. Using the first two
    # PCA components as the axes is an assumption, and it needs the projection
    # to have at least two components.
    kmean_data = dict(
        x=chemicalspace[:, 0],
        y=chemicalspace[:, 1],
        img=[mol2svg(m) for m in molecules_list],
        ids=[str(i) for i in range(0, len(smiles_list))],
        fill_color=kmeanc,
    )

    source = ColumnDataSource(kmean_data)
    plot = figure(plot_width=500, plot_height=500, tooltips=TOOLTIPS, title='error_compounds')
    plot.circle('x', 'y', color='fill_color', size=10, fill_alpha=0.2, source=source)

    plot = gridplot([[plot]])

    output_file(filename="Cheminformatic-Airflow/plugins/plotting_plugins/templates/pca_analysis.html", title="Static HTML file")
    # output_file only configures the destination; save() writes the HTML.
    save(plot)





I’m a cheminformatician exploring chemical infinity.

