提交 d50d1fa4 作者: 杨锋

Initial commit

上级
The source code of retrosynthesis prediction.
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e067f835-4f8e-4c04-885a-dad88ad2cb8c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from rdchiral import template_extractor\n",
"from tqdm import tqdm\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5a8343ed-cb20-42f6-9ee7-f736014bc228",
"metadata": {},
"outputs": [],
"source": [
"# Load the train, test and validation splits (in that order) from their CSV files.\n",
"# `.values` keeps only the NumPy representation of each DataFrame.\n",
"raw_train, raw_test, raw_val = (\n",
"    pd.read_csv(csv_name).values\n",
"    for csv_name in ('raw_train.csv', 'raw_test.csv', 'raw_val.csv')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "96bfba27-3dae-42ad-bbdd-4ec3136e80e4",
"metadata": {},
"outputs": [],
"source": [
"def _to_reaction_record(row):\n",
"    \"\"\"Build the reaction dict for one raw dataset row.\n",
"\n",
"    Column 0 of the row becomes '_id'. Column 2 is split on '>>': the first piece\n",
"    becomes 'reactants' and the second piece becomes 'products'.\n",
"\n",
"    Raises:\n",
"        IndexError: if column 2 contains no '>>' separator.\n",
"    \"\"\"\n",
"    # Split once and reuse both pieces instead of splitting the same string twice.\n",
"    pieces = row[2].split('>>')\n",
"    return {'_id': row[0], 'reactants': pieces[0], 'products': pieces[1]}\n",
"\n",
"\n",
"# One shared helper guarantees the three splits are converted identically.\n",
"reactions_train = [_to_reaction_record(row) for row in raw_train]\n",
"reactions_test = [_to_reaction_record(row) for row in raw_test]\n",
"reactions_val = [_to_reaction_record(row) for row in raw_val]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5bc462f3-f7bc-472c-a22e-ea7626dffac3",
"metadata": {},
"outputs": [],
"source": [
"def extract(reaction, verbose=False):\n",
"    \"\"\"Extract a reaction template for one reaction dict.\n",
"\n",
"    Args:\n",
"        reaction: dict passed unchanged to template_extractor.extract_from_reaction.\n",
"            It must contain an '_id' key, which is used for the fallback result.\n",
"        verbose: when True, print the reaction before extracting. Off by default,\n",
"            because printing every reaction of a large dataset floods the notebook\n",
"            output and bloats the saved file.\n",
"\n",
"    Returns:\n",
"        The value returned by extract_from_reaction, or\n",
"        {'reaction_id': reaction['_id']} if extraction raised an Exception.\n",
"\n",
"    Raises:\n",
"        KeyboardInterrupt: re-raised so that a manual interrupt stops the run.\n",
"    \"\"\"\n",
"    try:\n",
"        if verbose:\n",
"            print(reaction)\n",
"        return template_extractor.extract_from_reaction(reaction)\n",
"    except KeyboardInterrupt:\n",
"        print('Interrupted')\n",
"        # Bare raise re-raises the caught exception with its original traceback.\n",
"        raise\n",
"    except Exception as e:\n",
"        # Best effort: report the failure and return a placeholder so one bad\n",
"        # reaction does not abort the whole run.\n",
"        print(e)\n",
"        return {'reaction_id': reaction['_id']}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5ac8960d-b398-4794-b95e-5c876dee7664",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/40008 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US05849732', 'reactants': 'O=C(OCc1ccccc1)[NH:1][CH2:2][CH2:3][CH2:4][CH2:5][C@@H:6]([C:7]([O:8][CH3:9])=[O:10])[NH:11][C:12](=[O:13])[NH:14][c:15]1[cH:16][c:17]([O:18][CH3:19])[cH:20][c:21]([C:22]([CH3:23])([CH3:24])[CH3:25])[c:26]1[OH:27]', 'products': '[NH2:1][CH2:2][CH2:3][CH2:4][CH2:5][C@@H:6]([C:7]([O:8][CH3:9])=[O:10])[NH:11][C:12](=[O:13])[NH:14][c:15]1[cH:16][c:17]([O:18][CH3:19])[cH:20][c:21]([C:22]([CH3:23])([CH3:24])[CH3:25])[c:26]1[OH:27]'}\n",
"{'_id': 'US20120114765A1', 'reactants': 'O[C:1](=[O:2])[c:3]1[cH:4][c:5]([N+:6](=[O:7])[O-:8])[c:9]([S:10][c:11]2[c:12]([Cl:13])[cH:14][n:15][cH:16][c:17]2[Cl:18])[s:19]1.[NH2:20][c:21]1[cH:22][cH:23][cH:24][c:25]2[cH:26][n:27][cH:28][cH:29][c:30]12', 'products': '[C:1](=[O:2])([c:3]1[cH:4][c:5]([N+:6](=[O:7])[O-:8])[c:9]([S:10][c:11]2[c:12]([Cl:13])[cH:14][n:15][cH:16][c:17]2[Cl:18])[s:19]1)[NH:20][c:21]1[cH:22][cH:23][cH:24][c:25]2[cH:26][n:27][cH:28][cH:29][c:30]12'}\n",
"{'_id': 'US08003648B2', 'reactants': 'O=[CH:1][c:2]1[cH:3][cH:4][c:5](-[c:6]2[n:7][c:8]([CH3:9])[c:10]([CH2:11][O:12][c:13]3[cH:14][cH:15][c:16]([C@H:17]([CH2:18][C:19](=[O:20])[N:21]4[C:22](=[O:23])[O:24][CH2:25][C@@H:26]4[CH2:27][c:28]4[cH:29][cH:30][cH:31][cH:32][cH:33]4)[c:34]4[cH:35][cH:36][o:37][n:38]4)[cH:39][cH:40]3)[s:41]2)[cH:42][cH:43]1.[CH3:44][CH2:45][NH:46][CH2:47][CH3:48]', 'products': '[CH2:1]([c:2]1[cH:3][cH:4][c:5](-[c:6]2[n:7][c:8]([CH3:9])[c:10]([CH2:11][O:12][c:13]3[cH:14][cH:15][c:16]([C@H:17]([CH2:18][C:19](=[O:20])[N:21]4[C:22](=[O:23])[O:24][CH2:25][C@@H:26]4[CH2:27][c:28]4[cH:29][cH:30][cH:31][cH:32][cH:33]4)[c:34]4[cH:35][cH:36][o:37][n:38]4)[cH:39][cH:40]3)[s:41]2)[cH:42][cH:43]1)[N:46]([CH2:45][CH3:44])[CH2:47][CH3:48]'}\n",
"{'_id': 'US09045475B2', 'reactants': 'O=[C:1]([CH2:2][F:3])[CH2:4][F:5].[CH3:6][C:7]1([CH3:8])[CH2:9][CH2:10][C:11]([CH2:12][N:13]2[CH2:14][CH2:15][N:16]([c:17]3[cH:18][cH:19][c:20]([C:21](=[O:22])[NH:23][S:24](=[O:25])(=[O:26])[c:27]4[cH:28][cH:29][c:30]([NH:31][CH2:32][CH:33]5[CH2:34][NH:35][CH2:36]5)[c:37]([N+:38](=[O:39])[O-:40])[cH:41]4)[c:42]([O:43][c:44]4[cH:45][n:46][c:47]5[nH:48][cH:49][cH:50][c:51]5[cH:52]4)[cH:53]3)[CH2:54][CH2:55]2)=[C:56]([c:57]2[cH:58][cH:59][c:60]([Cl:61])[cH:62][cH:63]2)[CH2:64]1', 'products': '[CH:1]([CH2:2][F:3])([CH2:4][F:5])[N:35]1[CH2:34][CH:33]([CH2:32][NH:31][c:30]2[cH:29][cH:28][c:27]([S:24]([NH:23][C:21]([c:20]3[cH:19][cH:18][c:17]([N:16]4[CH2:15][CH2:14][N:13]([CH2:12][C:11]5=[C:56]([c:57]6[cH:58][cH:59][c:60]([Cl:61])[cH:62][cH:63]6)[CH2:64][C:7]([CH3:6])([CH3:8])[CH2:9][CH2:10]5)[CH2:55][CH2:54]4)[cH:53][c:42]3[O:43][c:44]3[cH:45][n:46][c:47]4[nH:48][cH:49][cH:50][c:51]4[cH:52]3)=[O:22])(=[O:25])=[O:26])[cH:41][c:37]2[N+:38](=[O:39])[O-:40])[CH2:36]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 4/40008 [00:00<24:20, 27.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US08188098B2', 'reactants': 'Cl[C:1](=[O:2])[O:3][CH:4]1[CH2:5][CH2:6][CH2:7][CH2:8]1.[CH3:9][CH2:10][O:11][c:12]1[cH:13][cH:14][c:15]([O:16][c:17]2[n:18][cH:19][n:20][c:21]3[c:22]2[cH:23][n:24][n:25]3[CH:26]2[CH2:27][CH2:28][NH:29][CH2:30][CH2:31]2)[c:32]([F:33])[cH:34]1', 'products': '[C:1](=[O:2])([O:3][CH:4]1[CH2:5][CH2:6][CH2:7][CH2:8]1)[N:29]1[CH2:28][CH2:27][CH:26]([n:25]2[c:21]3[n:20][cH:19][n:18][c:17]([O:16][c:15]4[cH:14][cH:13][c:12]([O:11][CH2:10][CH3:9])[cH:34][c:32]4[F:33])[c:22]3[cH:23][n:24]2)[CH2:31][CH2:30]1'}\n",
"{'_id': 'US20140275084A1', 'reactants': 'Br[c:1]1[n:2]([CH3:3])[cH:4][n:5][c:6]1-[c:7]1[cH:8][c:9]([C:10]#[N:11])[cH:12][cH:13][n:14]1.OB(O)[c:15]1[cH:16][cH:17][c:18](-[n:19]2[cH:20][cH:21][cH:22][n:23]2)[cH:24][cH:25]1', 'products': '[c:1]1(-[c:15]2[cH:16][cH:17][c:18](-[n:19]3[cH:20][cH:21][cH:22][n:23]3)[cH:24][cH:25]2)[n:2]([CH3:3])[cH:4][n:5][c:6]1-[c:7]1[cH:8][c:9]([C:10]#[N:11])[cH:12][cH:13][n:14]1'}\n",
"{'_id': 'US20110224204A1', 'reactants': 'Br[c:1]1[c:2]([O:3][CH:4]2[CH2:5][CH2:6][CH2:7][CH2:8][O:9]2)[cH:10][cH:11][c:12]([C:13]#[N:14])[cH:15]1.CC1(C)OB([c:16]2[cH:17][cH:18][c:19]([O:20][CH2:21][c:22]3[cH:23][cH:24][c:25]4[cH:26][cH:27][cH:28][cH:29][c:30]4[n:31]3)[cH:32][cH:33]2)OC1(C)C', 'products': '[c:1]1(-[c:16]2[cH:17][cH:18][c:19]([O:20][CH2:21][c:22]3[cH:23][cH:24][c:25]4[cH:26][cH:27][cH:28][cH:29][c:30]4[n:31]3)[cH:32][cH:33]2)[c:2]([O:3][CH:4]2[CH2:5][CH2:6][CH2:7][CH2:8][O:9]2)[cH:10][cH:11][c:12]([C:13]#[N:14])[cH:15]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 9/40008 [00:00<18:13, 36.57it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20130005716A1', 'reactants': '[CH3:1][O:2][c:3]1[cH:4][cH:5][c:6]([C:7](=[O:8])[CH2:9][c:10]2[c:11]([Cl:12])[cH:13][n+:14]([O-:15])[cH:16][c:17]2[Cl:18])[c:19]2[c:20]1[O:21][C:22]1([CH2:23][CH2:24][CH2:25][CH2:26]1)[O:27]2', 'products': '[CH3:1][O:2][c:3]1[cH:4][cH:5][c:6]([CH:7]([OH:8])[CH2:9][c:10]2[c:11]([Cl:12])[cH:13][n+:14]([O-:15])[cH:16][c:17]2[Cl:18])[c:19]2[c:20]1[O:21][C:22]1([CH2:23][CH2:24][CH2:25][CH2:26]1)[O:27]2'}\n",
"{'_id': 'US20130172384A1', 'reactants': 'O[C:1](=[O:2])[c:3]1[cH:4][cH:5][c:6]([Br:7])[cH:8][c:9]1[Cl:10].[CH3:11][C:12]([CH3:13])([CH3:14])[O:15][C:16](=[O:17])[NH:18][NH2:19]', 'products': '[C:1](=[O:2])([c:3]1[cH:4][cH:5][c:6]([Br:7])[cH:8][c:9]1[Cl:10])[NH:19][NH:18][C:16]([O:15][C:12]([CH3:11])([CH3:13])[CH3:14])=[O:17]'}\n",
"{'_id': 'US20050277619A1', 'reactants': 'Br[CH2:1][c:2]1[n:3][c:4]2[c:5]([NH2:6])[n:7][cH:8][n:9][c:10]2[n:11]1[CH2:12][CH2:13][c:14]1[cH:15][cH:16][cH:17][cH:18][cH:19]1.[CH3:20][CH2:21][O:22][P:23](=[O:24])([CH2:25][OH:26])[O:27][CH2:28][CH3:29]', 'products': '[CH2:1]([c:2]1[n:3][c:4]2[c:5]([NH2:6])[n:7][cH:8][n:9][c:10]2[n:11]1[CH2:12][CH2:13][c:14]1[cH:15][cH:16][cH:17][cH:18][cH:19]1)[O:26][CH2:25][P:23]([O:22][CH2:21][CH3:20])(=[O:24])[O:27][CH2:28][CH3:29]'}\n",
"{'_id': 'US20110263607A1', 'reactants': 'CC(C)(C)[O:1][C:2](=[O:3])[CH2:4][O:5][NH:6][C:7](=[O:8])[NH:9][CH2:10][c:11]1[cH:12][cH:13][cH:14][c:15]2[cH:16][cH:17][cH:18][cH:19][c:20]12', 'products': '[OH:1][C:2](=[O:3])[CH2:4][O:5][NH:6][C:7](=[O:8])[NH:9][CH2:10][c:11]1[cH:12][cH:13][cH:14][c:15]2[cH:16][cH:17][cH:18][cH:19][c:20]12'}\n",
"{'_id': 'US06372735B1', 'reactants': 'COc1ccc(C[n:1]2[n:2][n:3][c:4]([C:5]([O:6][CH2:7][CH3:8])=[O:9])[c:10]2[C:11](=[O:12])[c:13]2[cH:14][cH:15][c:16]([O:17][CH3:18])[c:19]([O:20][CH3:21])[cH:22]2)cc1', 'products': '[nH:1]1[n:2][n:3][c:4]([C:5]([O:6][CH2:7][CH3:8])=[O:9])[c:10]1[C:11](=[O:12])[c:13]1[cH:14][cH:15][c:16]([O:17][CH3:18])[c:19]([O:20][CH3:21])[cH:22]1'}\n",
"{'_id': 'US20100003305A1', 'reactants': 'O[C:1]([C@H:2]([CH:3]([CH3:4])[CH3:5])[NH:6][C:7](=[O:8])[CH2:9][NH:10][C:11](=[O:12])[C@@H:13]1[CH2:14][CH2:15][CH2:16][N:17]1[C:18](=[O:19])[C@@H:20]([NH:21][C:22](=[O:23])[O:24][CH2:25][c:26]1[cH:27][cH:28][cH:29][cH:30][cH:31]1)[CH:32]([CH3:33])[CH3:34])=[O:35].[CH3:36][O:37][C:38](=[O:39])[CH2:40][NH2:41]', 'products': '[C:1]([C@H:2]([CH:3]([CH3:4])[CH3:5])[NH:6][C:7](=[O:8])[CH2:9][NH:10][C:11](=[O:12])[C@@H:13]1[CH2:14][CH2:15][CH2:16][N:17]1[C:18](=[O:19])[C@@H:20]([NH:21][C:22](=[O:23])[O:24][CH2:25][c:26]1[cH:27][cH:28][cH:29][cH:30][cH:31]1)[CH:32]([CH3:33])[CH3:34])(=[O:35])[NH:41][CH2:40][C:38]([O:37][CH3:36])=[O:39]'}\n",
"{'_id': 'US07820673B2', 'reactants': 'O=[CH:1][c:2]1[cH:3][nH:4][cH:5][n:6]1.[NH2:7][CH:8]1[CH2:9][CH2:10][N:11]([CH2:12][c:13]2[cH:14][cH:15][cH:16][cH:17][cH:18]2)[CH2:19][CH2:20]1', 'products': '[CH2:1]([c:2]1[cH:3][nH:4][cH:5][n:6]1)[NH:7][CH:8]1[CH2:9][CH2:10][N:11]([CH2:12][c:13]2[cH:14][cH:15][cH:16][cH:17][cH:18]2)[CH2:19][CH2:20]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 14/40008 [00:00<16:34, 40.23it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20050148627A1', 'reactants': 'Br[CH2:1][c:2]1[cH:3][cH:4][cH:5][cH:6][cH:7]1.[OH:8][c:9]1[cH:10][cH:11][c:12]([OH:13])[c:14]([Br:15])[cH:16]1', 'products': '[CH2:1]([c:2]1[cH:3][cH:4][cH:5][cH:6][cH:7]1)[O:13][c:12]1[cH:11][cH:10][c:9]([OH:8])[cH:16][c:14]1[Br:15]'}\n",
"{'_id': 'US20070225349A1', 'reactants': 'C[O:1][c:2]1[cH:3][cH:4][c:5](-[c:6]2[n:7][n:8]([CH:9]([CH3:10])[CH3:11])[c:12]3[c:13]([Cl:14])[cH:15][cH:16][cH:17][c:18]23)[cH:19][cH:20]1', 'products': '[OH:1][c:2]1[cH:3][cH:4][c:5](-[c:6]2[n:7][n:8]([CH:9]([CH3:10])[CH3:11])[c:12]3[c:13]([Cl:14])[cH:15][cH:16][cH:17][c:18]23)[cH:19][cH:20]1'}\n",
"{'_id': 'US05834461', 'reactants': 'Cl[C:1]([c:2]1[c:3]([CH3:4])[cH:5][cH:6][cH:7][cH:8]1)=[O:9].[NH2:10][c:11]1[cH:12][cH:13][c:14]([C:15](=[O:16])[N:17]2[CH2:18][c:19]3[cH:20][cH:21][cH:22][cH:23][c:24]3[S:25][c:26]3[n:27][cH:28][cH:29][cH:30][c:31]32)[cH:32][cH:33]1', 'products': '[C:1]([c:2]1[c:3]([CH3:4])[cH:5][cH:6][cH:7][cH:8]1)(=[O:9])[NH:10][c:11]1[cH:12][cH:13][c:14]([C:15](=[O:16])[N:17]2[CH2:18][c:19]3[cH:20][cH:21][cH:22][cH:23][c:24]3[S:25][c:26]3[n:27][cH:28][cH:29][cH:30][c:31]32)[cH:32][cH:33]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 20/40008 [00:00<15:02, 44.30it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20150057271A1', 'reactants': 'F[c:1]1[cH:2][cH:3][c:4]([C:5]([F:6])([F:7])[F:8])[cH:9][c:10]1[Br:11].[cH:12]1[cH:13][cH:14][c:15]2[c:16]([cH:17]1)[NH:18][CH2:19][CH2:20][O:21]2', 'products': '[c:1]1([N:18]2[c:16]3[c:15]([cH:14][cH:13][cH:12][cH:17]3)[O:21][CH2:20][CH2:19]2)[cH:2][cH:3][c:4]([C:5]([F:6])([F:7])[F:8])[cH:9][c:10]1[Br:11]'}\n",
"{'_id': 'US05612288', 'reactants': 'F[c:1]1[c:2]([F:3])[cH:4][cH:5][cH:6][n:7]1.[CH3:8][CH:9]([CH3:10])[SH:11]', 'products': '[c:1]1([S:11][CH:9]([CH3:8])[CH3:10])[c:2]([F:3])[cH:4][cH:5][cH:6][n:7]1'}\n",
"{'_id': 'US20120202797A1', 'reactants': 'Cl[C:1]1=[N:2][c:3]2[c:4]([cH:5][c:6]([Cl:7])[cH:8][cH:9]2)[CH2:10][n:11]2[c:12]1[cH:13][c:14]([CH3:15])[cH:16]2.[CH3:17][O:18][C:19](=[O:20])[C:21]([CH3:22])([CH3:23])[CH2:24][N:25]1[CH2:26][CH2:27][NH:28][CH2:29][CH2:30]1', 'products': '[C:1]1([N:28]2[CH2:27][CH2:26][N:25]([CH2:24][C:21]([C:19]([O:18][CH3:17])=[O:20])([CH3:22])[CH3:23])[CH2:30][CH2:29]2)=[N:2][c:3]2[c:4]([cH:5][c:6]([Cl:7])[cH:8][cH:9]2)[CH2:10][n:11]2[c:12]1[cH:13][c:14]([CH3:15])[cH:16]2'}\n",
"{'_id': 'US05866589', 'reactants': 'c1ccc(C[N:1]2[CH2:2][CH2:3][CH:4]([N:5]([CH2:6][CH3:7])[c:8]3[n:9][c:10]([F:11])[cH:12][cH:13][c:14]3[NH:15][CH:16]([CH3:17])[CH3:18])[CH2:19][CH2:20]2)cc1', 'products': '[NH:1]1[CH2:2][CH2:3][CH:4]([N:5]([CH2:6][CH3:7])[c:8]2[n:9][c:10]([F:11])[cH:12][cH:13][c:14]2[NH:15][CH:16]([CH3:17])[CH3:18])[CH2:19][CH2:20]1'}\n",
"{'_id': 'US20110152240A1', 'reactants': 'O=[CH:1][c:2]1[cH:3][n:4][n:5]2[c:6]([NH:7][CH:8]3[CH2:9][CH2:10]3)[cH:11][c:12]([NH:13][CH:14]3[CH2:15][CH2:16][CH2:17][CH2:18]3)[n:19][c:20]12.[O:21]=[C:22]1[CH2:23][NH:24][C:25](=[O:26])[NH:27]1', 'products': '[CH:1](\\\\[c:2]1[cH:3][n:4][n:5]2[c:6]([NH:7][CH:8]3[CH2:9][CH2:10]3)[cH:11][c:12]([NH:13][CH:14]3[CH2:15][CH2:16][CH2:17][CH2:18]3)[n:19][c:20]12)=[C:23]1/[C:22](=[O:21])[NH:27][C:25](=[O:26])[NH:24]1'}\n",
"{'_id': 'US20060217346A1', 'reactants': 'Br[CH:1]([C:2]([O:3][C:4]([CH3:5])([CH3:6])[CH3:7])=[O:8])[O:9][c:10]1[cH:11][c:12]([Cl:13])[cH:14][c:15]([Cl:16])[cH:17]1.[CH3:18][O-:19]', 'products': '[CH:1]([C:2]([O:3][C:4]([CH3:5])([CH3:6])[CH3:7])=[O:8])([O:9][c:10]1[cH:11][c:12]([Cl:13])[cH:14][c:15]([Cl:16])[cH:17]1)[O:19][CH3:18]'}\n",
"{'_id': 'US09133104B2', 'reactants': 'Br[CH2:1][c:2]1[cH:3][cH:4][cH:5][c:6]([N+:7](=[O:8])[O-:9])[cH:10]1.[NH3:11]', 'products': '[CH2:1]([c:2]1[cH:3][cH:4][cH:5][c:6]([N+:7](=[O:8])[O-:9])[cH:10]1)[NH2:11]'}\n",
"{'_id': 'US20040224976A1', 'reactants': 'C[Si](C)(C)[C:1]#[C:2][c:3]1[cH:4][o:5][c:6]2[cH:7][cH:8][c:9]([C:10](=[O:11])[NH:12][C@H:13]3[CH2:14][N:15]4[CH2:16][CH2:17][CH:18]3[CH2:19][CH2:20]4)[cH:21][c:22]12', 'products': '[CH:1]#[C:2][c:3]1[cH:4][o:5][c:6]2[cH:7][cH:8][c:9]([C:10](=[O:11])[NH:12][C@H:13]3[CH2:14][N:15]4[CH2:16][CH2:17][CH:18]3[CH2:19][CH2:20]4)[cH:21][c:22]12'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 25/40008 [00:00<14:37, 45.59it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US05872117', 'reactants': 'Cl[S:1]([CH3:2])(=[O:3])=[O:4].[CH3:5][C:6](=[O:7])[O:8][CH:9]([CH2:10][O:11][CH2:12][CH2:13][OH:14])[c:15]1[cH:16][cH:17][cH:18][c:19]([Cl:20])[cH:21]1', 'products': '[S:1]([CH3:2])(=[O:3])(=[O:4])[O:14][CH2:13][CH2:12][O:11][CH2:10][CH:9]([O:8][C:6]([CH3:5])=[O:7])[c:15]1[cH:16][cH:17][cH:18][c:19]([Cl:20])[cH:21]1'}\n",
"{'_id': 'US20090286778A1', 'reactants': '[Cl:8][c:9]1[cH:10][n:11][c:12]2[n:13][c:14]1[NH:15][c:16]1[cH:17][cH:18][c:19]([O:20][CH2:21][CH2:22][CH:23]3[CH2:24][CH2:25][NH:26][CH2:27][CH2:28]3)[c:29]([cH:30]1)[CH2:31][CH2:32][c:33]1[cH:34][n:35][cH:36][c:37]([cH:38]1)[NH:39]2.[O:40]=[C:41]=[N:42][CH2:43][c:44]1[cH:45][cH:46][cH:47][o:48]1', 'products': '[Cl:8][c:9]1[cH:10][n:11][c:12]2[n:13][c:14]1[NH:15][c:16]1[cH:17][cH:18][c:19]([O:20][CH2:21][CH2:22][CH:23]3[CH2:24][CH2:25][N:26]([C:41](=[O:40])[NH:42][CH2:43][c:44]4[cH:45][cH:46][cH:47][o:48]4)[CH2:27][CH2:28]3)[c:29]([cH:30]1)[CH2:31][CH2:32][c:33]1[cH:34][n:35][cH:36][c:37]([cH:38]1)[NH:39]2'}\n",
"{'_id': 'US05556977', 'reactants': 'Br[c:1]1[cH:2][cH:3][c:4]([O:5][CH2:6][c:7]2[cH:8][cH:9][cH:10][cH:11][cH:12]2)[cH:13][cH:14]1.[O:15]=[C:16]1[CH2:17][N:18]([c:19]2[cH:20][cH:21][n:22][cH:23][cH:24]2)[CH2:25][CH2:26][NH:27]1', 'products': '[c:1]1([N:27]2[C:16](=[O:15])[CH2:17][N:18]([c:19]3[cH:20][cH:21][n:22][cH:23][cH:24]3)[CH2:25][CH2:26]2)[cH:2][cH:3][c:4]([O:5][CH2:6][c:7]2[cH:8][cH:9][cH:10][cH:11][cH:12]2)[cH:13][cH:14]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 30/40008 [00:00<14:25, 46.19it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20030232832A1', 'reactants': 'O[CH:1]([CH2:2][CH3:3])[c:4]1[n:5][n:6]2[cH:7][cH:8][cH:9][c:10]2[c:11](=[O:12])[n:13]1[CH2:14][c:15]1[cH:16][cH:17][cH:18][cH:19][cH:20]1.[CH3:21][C:22]([CH3:23])([CH3:24])[O:25][C:26](=[O:27])[NH:28][CH2:29][CH2:30][CH2:31][NH2:32]', 'products': '[CH:1]([CH2:2][CH3:3])([c:4]1[n:5][n:6]2[cH:7][cH:8][cH:9][c:10]2[c:11](=[O:12])[n:13]1[CH2:14][c:15]1[cH:16][cH:17][cH:18][cH:19][cH:20]1)[NH:32][CH2:31][CH2:30][CH2:29][NH:28][C:26]([O:25][C:22]([CH3:21])([CH3:23])[CH3:24])=[O:27]'}\n",
"{'_id': 'US08258134B2', 'reactants': 'C[O:1][C:2](=[O:3])[CH:4]([CH2:5][CH:6]1[CH2:7][CH2:8][CH2:9][CH2:10]1)[n:11]1[n:12][cH:13][c:14]([O:15][c:16]2[cH:17][cH:18][cH:19][cH:20][c:21]2[C:22]([CH3:23])=[O:24])[cH:25][c:26]1=[O:27]', 'products': '[OH:1][C:2](=[O:3])[CH:4]([CH2:5][CH:6]1[CH2:7][CH2:8][CH2:9][CH2:10]1)[n:11]1[n:12][cH:13][c:14]([O:15][c:16]2[cH:17][cH:18][cH:19][cH:20][c:21]2[C:22]([CH3:23])=[O:24])[cH:25][c:26]1=[O:27]'}\n",
"{'_id': 'US20110092461A1', 'reactants': 'O[C:1](=[O:2])[c:3]1[cH:4][cH:5][c:6](-[n:7]2[cH:8][n:9][cH:10][n:11]2)[n:12][cH:13]1.[NH2:14][c:15]1[cH:16][cH:17][c:18]([Cl:19])[c:20](-[c:21]2[cH:22][cH:23][cH:24][cH:25][n:26]2)[cH:27]1', 'products': '[C:1](=[O:2])([c:3]1[cH:4][cH:5][c:6](-[n:7]2[cH:8][n:9][cH:10][n:11]2)[n:12][cH:13]1)[NH:14][c:15]1[cH:16][cH:17][c:18]([Cl:19])[c:20](-[c:21]2[cH:22][cH:23][cH:24][cH:25][n:26]2)[cH:27]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 35/40008 [00:00<14:05, 47.26it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US04298535', 'reactants': '[CH2:1]=[CH:2][C:3]([CH3:4])=[O:5].[CH2:6]=[C:7]1[C:8](=[CH2:9])[CH:10]2[O:11][CH:12]1[C:13](=[CH2:14])[C:15]2=[CH2:16]', 'products': '[CH2:1]1[CH:2]([C:3]([CH3:4])=[O:5])[CH2:6][C:7]2=[C:8]([CH2:9]1)[CH:10]1[O:11][CH:12]2[C:13](=[CH2:14])[C:15]1=[CH2:16]'}\n",
"{'_id': 'US06051731', 'reactants': 'O=[CH:1][CH2:2][c:3]1[cH:4][cH:5][c:6]([Br:7])[cH:8][cH:9]1.c1ccc(P(c2ccccc2)(c2ccccc2)=[CH:10][C:11]([O:12][CH2:13][CH3:14])=[O:15])cc1', 'products': '[CH:1]([CH2:2][c:3]1[cH:4][cH:5][c:6]([Br:7])[cH:8][cH:9]1)=[CH:10][C:11]([O:12][CH2:13][CH3:14])=[O:15]'}\n",
"{'_id': 'US20120142685A1', 'reactants': 'Cl[c:1]1[n:2][cH:3][cH:4][c:5](-[c:6]2[n:7][c:8]([N:9]3[CH2:10][CH2:11][N:12]([C:13]([O:14][C:15]([CH3:16])([CH3:17])[CH3:18])=[O:19])[CH2:20][CH2:21]3)[c:22]3[c:23]([cH:24]2)[cH:25][n:26][cH:27][cH:28]3)[cH:29]1.[NH2:30][CH:31]1[CH2:32][CH2:33][O:34][CH2:35][CH2:36]1', 'products': '[c:1]1([NH:30][CH:31]2[CH2:32][CH2:33][O:34][CH2:35][CH2:36]2)[n:2][cH:3][cH:4][c:5](-[c:6]2[n:7][c:8]([N:9]3[CH2:10][CH2:11][N:12]([C:13]([O:14][C:15]([CH3:16])([CH3:17])[CH3:18])=[O:19])[CH2:20][CH2:21]3)[c:22]3[c:23]([cH:24]2)[cH:25][n:26][cH:27][cH:28]3)[cH:29]1'}\n",
"{'_id': 'US08664216B2', 'reactants': 'O=[CH2:1].[Cl:2][c:3]1[cH:4][cH:5][c:6]2[c:7]([cH:8]1)[CH2:9][NH:10][CH2:11][c:12]1[n:13][n:14][c:15]([Br:16])[n:17]1-2', 'products': '[CH3:1][N:10]1[CH2:9][c:7]2[c:6]([cH:5][cH:4][c:3]([Cl:2])[cH:8]2)-[n:17]2[c:12]([n:13][n:14][c:15]2[Br:16])[CH2:11]1'}\n",
"{'_id': 'US04564609', 'reactants': 'O[C:1](=[O:2])[C@@H:3]1[CH2:4][CH2:5][CH2:6][N:7]1[C:8](=[O:9])[O:10][CH2:11][c:12]1[cH:13][cH:14][cH:15][cH:16][cH:17]1.[CH3:18][NH:19][CH3:20]', 'products': '[C:1](=[O:2])([C@@H:3]1[CH2:4][CH2:5][CH2:6][N:7]1[C:8](=[O:9])[O:10][CH2:11][c:12]1[cH:13][cH:14][cH:15][cH:16][cH:17]1)[N:19]([CH3:18])[CH3:20]'}\n",
"{'_id': 'US20120016029A1', 'reactants': 'Br[CH2:1][c:2]1[c:3]([O:4][c:5]2[c:6]([O:7][CH3:8])[cH:9][cH:10][c:11]([CH2:12][C:13]([O:14][CH2:15][CH3:16])=[O:17])[cH:18]2)[cH:19][cH:20][c:21]([N+:22](=[O:23])[O-:24])[cH:25]1.[F:26][C:27]([F:28])([F:29])[CH2:30][SH:31]', 'products': '[CH2:1]([c:2]1[c:3]([O:4][c:5]2[c:6]([O:7][CH3:8])[cH:9][cH:10][c:11]([CH2:12][C:13]([O:14][CH2:15][CH3:16])=[O:17])[cH:18]2)[cH:19][cH:20][c:21]([N+:22](=[O:23])[O-:24])[cH:25]1)[S:31][CH2:30][C:27]([F:26])([F:28])[F:29]'}\n",
"{'_id': 'US07964613B2', 'reactants': 'Cl[S:1](=[O:2])(=[O:3])[c:4]1[cH:5][cH:6][cH:7][c:8]2[cH:9][n:10][cH:11][c:12]([Cl:13])[c:14]12.[CH3:15][C:16]([CH3:17])([CH3:18])[O:19][C:20](=[O:21])[NH:22][CH2:23][CH:24]1[CH2:25][CH2:26][NH:27][CH2:28][CH2:29]1', 'products': '[S:1](=[O:2])(=[O:3])([c:4]1[cH:5][cH:6][cH:7][c:8]2[cH:9][n:10][cH:11][c:12]([Cl:13])[c:14]12)[N:27]1[CH2:26][CH2:25][CH:24]([CH2:23][NH:22][C:20]([O:19][C:16]([CH3:15])([CH3:17])[CH3:18])=[O:21])[CH2:29][CH2:28]1'}\n",
"{'_id': 'US20090081165A1', 'reactants': 'Cl[c:1]1[cH:2][cH:3][c:4]([C:5]([O:6][CH2:7][CH3:8])=[O:9])[cH:10][c:11]1[N+:12](=[O:13])[O-:14].[NH2:15][C@H:16]1[CH2:17][CH2:18][C@H:19]([OH:20])[CH2:21][CH2:22]1', 'products': '[c:1]1([NH:15][C@H:16]2[CH2:17][CH2:18][C@H:19]([OH:20])[CH2:21][CH2:22]2)[cH:2][cH:3][c:4]([C:5]([O:6][CH2:7][CH3:8])=[O:9])[cH:10][c:11]1[N+:12](=[O:13])[O-:14]'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 41/40008 [00:00<13:37, 48.89it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US08404673B2', 'reactants': 'O=[C:1]1[CH2:2][CH2:3][N:4]([C:5]([CH3:6])=[O:7])[CH2:8][CH2:9]1.[CH3:10][O:11][c:12]1[cH:13][c:14]([C:15]([F:16])([F:17])[F:18])[cH:19][c:20]([C:21]([F:22])([F:23])[F:24])[c:25]1[C:26](=[O:27])[NH:28][C@@H:29]1[CH2:30][CH2:31][CH2:32][CH2:33][C@@H:34]1[NH2:35]', 'products': '[CH:1]1([NH:35][C@@H:34]2[C@H:29]([NH:28][C:26]([c:25]3[c:12]([O:11][CH3:10])[cH:13][c:14]([C:15]([F:16])([F:17])[F:18])[cH:19][c:20]3[C:21]([F:22])([F:23])[F:24])=[O:27])[CH2:30][CH2:31][CH2:32][CH2:33]2)[CH2:2][CH2:3][N:4]([C:5]([CH3:6])=[O:7])[CH2:8][CH2:9]1'}\n",
"{'_id': 'US06888000B2', 'reactants': 'Cl[S:1](=[O:2])(=[O:3])[c:4]1[cH:5][cH:6][cH:7][cH:8][cH:9]1.[CH3:10][CH2:11][CH2:12][CH2:13][CH2:14][c:15]1[n:16][c:17]2[c:18]([NH2:19])[n:20][c:21]3[cH:22][cH:23][cH:24][cH:25][c:26]3[c:27]2[n:28]1[CH2:29][CH2:30][CH2:31][CH2:32][NH2:33]', 'products': '[S:1](=[O:2])(=[O:3])([c:4]1[cH:5][cH:6][cH:7][cH:8][cH:9]1)[NH:33][CH2:32][CH2:31][CH2:30][CH2:29][n:28]1[c:15]([CH2:14][CH2:13][CH2:12][CH2:11][CH3:10])[n:16][c:17]2[c:18]([NH2:19])[n:20][c:21]3[cH:22][cH:23][cH:24][cH:25][c:26]3[c:27]21'}\n",
"{'_id': 'US07285558B2', 'reactants': 'O[C:1]([c:2]1[cH:3][c:4](-[c:5]2[c:6]3[c:7](=[O:8])[n:9]([CH3:10])[c:11](=[O:12])[n:13]([CH2:14][CH:15]([CH3:16])[CH3:17])[c:18]3[n:19][n:20]2[CH2:21][c:22]2[cH:23][nH:24][c:25]3[cH:26][cH:27][c:28]([Cl:29])[cH:30][c:31]23)[n:32]([CH3:33])[cH:34]1)=[O:35].[CH3:36][CH:37]([CH3:38])[NH:39][CH2:40][CH2:41][NH2:42]', 'products': '[C:1]([c:2]1[cH:3][c:4](-[c:5]2[c:6]3[c:7](=[O:8])[n:9]([CH3:10])[c:11](=[O:12])[n:13]([CH2:14][CH:15]([CH3:16])[CH3:17])[c:18]3[n:19][n:20]2[CH2:21][c:22]2[cH:23][nH:24][c:25]3[cH:26][cH:27][c:28]([Cl:29])[cH:30][c:31]23)[n:32]([CH3:33])[cH:34]1)(=[O:35])[NH:42][CH2:41][CH2:40][NH:39][CH:37]([CH3:36])[CH3:38]'}\n",
"{'_id': 'US04006232', 'reactants': 'Cl[CH2:1][CH2:2][CH2:3][N:4]1[CH2:5][CH2:6][N:7]([c:8]2[cH:9][cH:10][cH:11][cH:12][cH:13]2)[CH2:14][CH2:15]1.[O:24]=[C:25]1[NH:26][C:27](=[O:28])[C:29]([c:30]2[cH:31][cH:32][cH:33][cH:34][cH:35]2)([c:36]2[cH:37][cH:38][cH:39][cH:40][cH:41]2)[NH:42]1', 'products': '[CH2:1]([CH2:2][CH2:3][N:4]1[CH2:5][CH2:6][N:7]([c:8]2[cH:9][cH:10][cH:11][cH:12][cH:13]2)[CH2:14][CH2:15]1)[N:26]1[C:25](=[O:24])[NH:42][C:29]([c:30]2[cH:31][cH:32][cH:33][cH:34][cH:35]2)([c:36]2[cH:37][cH:38][cH:39][cH:40][cH:41]2)[C:27]1=[O:28]'}\n",
"{'_id': 'US04927956', 'reactants': 'Cl[C:1]([CH3:2])=[O:3].[CH3:4][c:5]1[cH:6][cH:7][cH:8][cH:9][c:10]1[OH:11]', 'products': '[C:1]([CH3:2])(=[O:3])[O:11][c:10]1[c:5]([CH3:4])[cH:6][cH:7][cH:8][cH:9]1'}\n",
"{'_id': 'US20100009970A1', 'reactants': 'CC(C)(C)OC(=O)O[C:6]([O:5][C:2]([CH3:1])([CH3:3])[CH3:4])=[O:7].[CH3:8][NH:9][C@H:10]1[CH2:11][CH2:12][C@@H:13]([c:14]2[cH:15][cH:16][c:17]([Cl:18])[c:19]([Cl:20])[cH:21]2)[c:22]2[cH:23][cH:24][c:25]([C:26](=[O:27])[O:28][CH3:29])[cH:30][c:31]21', 'products': '[CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[N:9]([CH3:8])[C@H:10]1[CH2:11][CH2:12][C@@H:13]([c:14]2[cH:15][cH:16][c:17]([Cl:18])[c:19]([Cl:20])[cH:21]2)[c:22]2[cH:23][cH:24][c:25]([C:26](=[O:27])[O:28][CH3:29])[cH:30][c:31]21'}\n",
"{'_id': 'US06153768', 'reactants': 'Cl[S:1]([c:2]1[cH:3][cH:4][c:5]([CH3:6])[cH:7][cH:8]1)(=[O:9])=[O:10].[CH3:11][O:12][CH2:13][O:14][c:15]1[cH:16][cH:17][c:18]([C:19]2([CH3:20])[CH2:21][O:22][c:23]3[cH:24][c:25]([O:26][CH2:27][O:28][CH3:29])[cH:30][cH:31][c:32]3[CH:33]2[CH2:34][CH2:35][CH2:36][CH2:37][CH2:38][CH2:39][CH2:40][CH2:41][OH:42])[cH:43][cH:44]1', 'products': '[S:1]([c:2]1[cH:3][cH:4][c:5]([CH3:6])[cH:7][cH:8]1)(=[O:9])(=[O:10])[O:42][CH2:41][CH2:40][CH2:39][CH2:38][CH2:37][CH2:36][CH2:35][CH2:34][CH:33]1[C:19]([c:18]2[cH:17][cH:16][c:15]([O:14][CH2:13][O:12][CH3:11])[cH:44][cH:43]2)([CH3:20])[CH2:21][O:22][c:23]2[cH:24][c:25]([O:26][CH2:27][O:28][CH3:29])[cH:30][cH:31][c:32]21'}\n",
"{'_id': 'US20090105305A1', 'reactants': 'O[C:1]([c:2]1[cH:3][cH:4][cH:5][c:6]([S:7]([NH:8][c:9]2[c:10]([CH3:11])[cH:12][cH:13][c:14]([C:15](=[O:16])[N:17]3[CH2:18][CH2:19][CH:20]([c:21]4[cH:22][cH:23][c:24]([C:25]#[N:26])[cH:27][cH:28]4)[CH2:29][CH2:30]3)[cH:31]2)(=[O:32])=[O:33])[cH:34]1)=[O:35].[CH3:36][NH:37][CH3:38]', 'products': '[C:1]([c:2]1[cH:3][cH:4][cH:5][c:6]([S:7]([NH:8][c:9]2[c:10]([CH3:11])[cH:12][cH:13][c:14]([C:15](=[O:16])[N:17]3[CH2:18][CH2:19][CH:20]([c:21]4[cH:22][cH:23][c:24]([C:25]#[N:26])[cH:27][cH:28]4)[CH2:29][CH2:30]3)[cH:31]2)(=[O:32])=[O:33])[cH:34]1)(=[O:35])[N:37]([CH3:36])[CH3:38]'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 47/40008 [00:01<13:31, 49.23it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20150018332A1', 'reactants': 'I[c:1]1[cH:2][nH:3][cH:4][n:5]1.OB(O)[c:6]1[cH:7][c:8]([C:9]([F:10])([F:11])[F:12])[cH:13][c:14]([C:15]([F:16])([F:17])[F:18])[cH:19]1', 'products': '[c:1]1(-[c:6]2[cH:7][c:8]([C:9]([F:10])([F:11])[F:12])[cH:13][c:14]([C:15]([F:16])([F:17])[F:18])[cH:19]2)[cH:2][nH:3][cH:4][n:5]1'}\n",
"{'_id': 'US05654299', 'reactants': 'Cl[C:1](=[O:2])[O:3][CH2:4][c:5]1[cH:6][cH:7][cH:8][cH:9][cH:10]1.[N:11]#[C:12][CH:13]1[CH2:14][CH2:15][NH:16][CH2:17][CH2:18]1', 'products': '[C:1](=[O:2])([O:3][CH2:4][c:5]1[cH:6][cH:7][cH:8][cH:9][cH:10]1)[N:16]1[CH2:15][CH2:14][CH:13]([C:12]#[N:11])[CH2:18][CH2:17]1'}\n",
"{'_id': 'US20120101079A1', 'reactants': 'Cl[c:1]1[n:2][c:3]([S:4][CH2:5][CH3:6])[c:7]([C:8](=[O:9])[NH:10][CH2:11][c:12]2[cH:13][cH:14][cH:15][c:16]([F:17])[cH:18]2)[c:19]([CH3:20])[cH:21]1.[OH:22][CH2:23][C@H:24]1[CH2:25][O:26][CH2:27][CH2:28][NH:29]1', 'products': '[c:1]1([N:29]2[C@@H:24]([CH2:23][OH:22])[CH2:25][O:26][CH2:27][CH2:28]2)[n:2][c:3]([S:4][CH2:5][CH3:6])[c:7]([C:8](=[O:9])[NH:10][CH2:11][c:12]2[cH:13][cH:14][cH:15][c:16]([F:17])[cH:18]2)[c:19]([CH3:20])[cH:21]1'}\n",
"{'_id': 'US20120095249A1', 'reactants': '[CH3:1][C:2]1([CH3:3])[C@:4]2([C:5](=[O:6])[NH:7][c:8]3[cH:9][cH:10][c:11]([Cl:12])[cH:13][c:14]3[C:15](=[O:16])[C:17]([F:18])([F:19])[F:20])[CH2:21][CH2:22][C@@:23]1([CH3:24])[C:25](=[O:26])[O:27]2.[CH:28]#[C:29][CH:30]1[CH2:31][CH2:32]1', 'products': '[CH3:1][C:2]1([CH3:3])[C@:4]2([C:5](=[O:6])[NH:7][c:8]3[cH:9][cH:10][c:11]([Cl:12])[cH:13][c:14]3[C@:15]([OH:16])([C:17]([F:18])([F:19])[F:20])[C:28]#[C:29][CH:30]3[CH2:31][CH2:32]3)[CH2:21][CH2:22][C@@:23]1([CH3:24])[C:25](=[O:26])[O:27]2'}\n",
"{'_id': 'US06492393B1', 'reactants': 'O=[N+:1]([O-])[c:2]1[c:3]([CH3:4])[cH:5][cH:6][c:7]([C:8]([CH3:9])([CH3:10])[CH3:11])[cH:12]1', 'products': '[NH2:1][c:2]1[c:3]([CH3:4])[cH:5][cH:6][c:7]([C:8]([CH3:9])([CH3:10])[CH3:11])[cH:12]1'}\n",
"{'_id': 'US20140235623A1', 'reactants': '[CH3:1][NH:2][c:3]1[cH:4][cH:5][cH:6][c:7]([NH2:8])[c:9]1[C:10]#[N:11].[O:12]=[C:13]=[N:14][C:15](=[O:16])[c:17]1[cH:18][cH:19][cH:20][cH:21][cH:22]1', 'products': '[CH3:1][NH:2][c:3]1[cH:4][cH:5][cH:6][c:7]([NH:8][C:13](=[O:12])[NH:14][C:15](=[O:16])[c:17]2[cH:18][cH:19][cH:20][cH:21][cH:22]2)[c:9]1[C:10]#[N:11]'}\n",
"{'_id': 'US20110207731A1', 'reactants': 'Cl[S:1](=[O:2])(=[O:3])[CH2:4][c:5]1[cH:6][cH:7][c:8]([F:9])[cH:10][cH:11]1.[CH3:12][C:13]1=[C:14]([C:15]#[N:16])[CH:17]([c:18]2[cH:19][cH:20][c:21]3[c:22]([cH:23]2)[c:24]([NH2:25])[n:26][n:27]3[C:28](=[O:29])[O:30][C:31]([CH3:32])([CH3:33])[CH3:34])[C:35]([C:36]#[N:37])=[C:38]([CH3:39])[NH:40]1', 'products': '[S:1](=[O:2])(=[O:3])([CH2:4][c:5]1[cH:6][cH:7][c:8]([F:9])[cH:10][cH:11]1)[NH:25][c:24]1[c:22]2[c:21]([cH:20][cH:19][c:18]([CH:17]3[C:14]([C:15]#[N:16])=[C:13]([CH3:12])[NH:40][C:38]([CH3:39])=[C:35]3[C:36]#[N:37])[cH:23]2)[n:27]([C:28](=[O:29])[O:30][C:31]([CH3:32])([CH3:33])[CH3:34])[n:26]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 54/40008 [00:01<12:44, 52.25it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20090054398A1', 'reactants': 'O[C:1]([C@@H:2]([NH:3][C:4](=[O:5])[CH2:6][c:7]1[cH:8][c:9]([F:10])[cH:11][c:12]([F:13])[cH:14]1)[CH2:15][c:16]1[cH:17][c:18]2[cH:19][cH:20][cH:21][cH:22][c:23]2[nH:24]1)=[O:25].[NH2:26][C@H:27]1[C:28](=[O:29])[NH:30][c:31]2[cH:32][cH:33][cH:34][cH:35][c:36]2[S:37][C@H:38]1[c:39]1[cH:40][cH:41][cH:42][cH:43][cH:44]1', 'products': '[C:1]([C@@H:2]([NH:3][C:4](=[O:5])[CH2:6][c:7]1[cH:8][c:9]([F:10])[cH:11][c:12]([F:13])[cH:14]1)[CH2:15][c:16]1[cH:17][c:18]2[cH:19][cH:20][cH:21][cH:22][c:23]2[nH:24]1)(=[O:25])[NH:26][C@@H:27]1[C:28](=[O:29])[NH:30][c:31]2[cH:32][cH:33][cH:34][cH:35][c:36]2[S:37][C@@H:38]1[c:39]1[cH:40][cH:41][cH:42][cH:43][cH:44]1'}\n",
"{'_id': 'US20140309193A1', 'reactants': 'CCO[C:1](=[O:2])[c:3]1[c:4]([O:5][CH2:6][c:7]2[cH:8][cH:9][cH:10][cH:11][cH:12]2)[c:13]([O:14][CH2:15][c:16]2[cH:17][cH:18][cH:19][cH:20][cH:21]2)[c:22]([C:23](=[O:24])[N:25]([CH3:26])[CH3:27])[n:28]1-[c:29]1[cH:30][cH:31][c:32]([O:33][CH3:34])[cH:35][cH:36]1.[CH3:37][CH2:38][NH2:39]', 'products': '[C:1](=[O:2])([c:3]1[c:4]([O:5][CH2:6][c:7]2[cH:8][cH:9][cH:10][cH:11][cH:12]2)[c:13]([O:14][CH2:15][c:16]2[cH:17][cH:18][cH:19][cH:20][cH:21]2)[c:22]([C:23](=[O:24])[N:25]([CH3:26])[CH3:27])[n:28]1-[c:29]1[cH:30][cH:31][c:32]([O:33][CH3:34])[cH:35][cH:36]1)[NH:39][CH2:38][CH3:37]'}\n",
"{'_id': 'US20080221096A1', 'reactants': 'Cl[c:1]1[c:2]2[o:3][c:4]3[n:5][c:6]([N:7]([CH3:8])[CH2:9][c:10]4[cH:11][cH:12][cH:13][cH:14][cH:15]4)[c:16]4[c:17]([c:18]3[c:19]2[n:20][cH:21][n:22]1)[CH2:23][C:24]([CH3:25])([CH3:26])[O:27][CH2:28]4.[NH2:29][CH2:30][CH2:31][N:32]1[CH2:33][CH2:34][O:35][CH2:36][CH2:37]1', 'products': '[c:1]1([NH:29][CH2:30][CH2:31][N:32]2[CH2:33][CH2:34][O:35][CH2:36][CH2:37]2)[c:2]2[o:3][c:4]3[n:5][c:6]([N:7]([CH3:8])[CH2:9][c:10]4[cH:11][cH:12][cH:13][cH:14][cH:15]4)[c:16]4[c:17]([c:18]3[c:19]2[n:20][cH:21][n:22]1)[CH2:23][C:24]([CH3:25])([CH3:26])[O:27][CH2:28]4'}\n",
"{'_id': 'US07820704B2', 'reactants': 'Cl[c:1]1[n:2][cH:3][cH:4][c:5]([NH:6][CH:7]2[CH2:8][CH2:9][CH2:10][CH2:11]2)[n:12]1.OB(O)[c:13]1[cH:14][cH:15][c:16]([O:17][CH2:18][c:19]2[cH:20][cH:21][cH:22][cH:23][cH:24]2)[cH:25][cH:26]1', 'products': '[c:1]1(-[c:13]2[cH:14][cH:15][c:16]([O:17][CH2:18][c:19]3[cH:20][cH:21][cH:22][cH:23][cH:24]3)[cH:25][cH:26]2)[n:2][cH:3][cH:4][c:5]([NH:6][CH:7]2[CH2:8][CH2:9][CH2:10][CH2:11]2)[n:12]1'}\n",
"{'_id': 'US20090221555A1', 'reactants': 'Cl[c:1]1[n:2][c:3]([NH:4][c:5]2[c:6]([C:7]([NH:8][CH2:9][C:10]#[CH:11])=[O:12])[cH:13][cH:14][cH:15][c:16]2[F:17])[c:18]([Cl:19])[cH:20][n:21]1.[CH3:22][CH2:23][N:24]1[CH2:25][C:26]([OH:27])([CH2:28][OH:29])[CH2:30][O:31][c:32]2[cH:33][c:34]([NH2:35])[cH:36][cH:37][c:38]21', 'products': '[c:1]1([NH:35][c:34]2[cH:33][c:32]3[c:38]([cH:37][cH:36]2)[N:24]([CH2:23][CH3:22])[CH2:25][C:26]([OH:27])([CH2:28][OH:29])[CH2:30][O:31]3)[n:2][c:3]([NH:4][c:5]2[c:6]([C:7]([NH:8][CH2:9][C:10]#[CH:11])=[O:12])[cH:13][cH:14][cH:15][c:16]2[F:17])[c:18]([Cl:19])[cH:20][n:21]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 60/40008 [00:01<13:14, 50.27it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US08003677B2', 'reactants': 'O[C:1](=[O:2])[c:3]1[cH:4][cH:5][c:6]([N:7]2[CH2:8][C:9]3=[C:10]([CH2:11][N:12]([C:13](=[O:14])[c:15]4[cH:16][cH:17][cH:18][cH:19][c:20]4[C:21]([F:22])([F:23])[F:24])[CH2:25]3)[CH2:26]2)[n:27][cH:28]1.[NH2:29][CH2:30][CH2:31][CH:32]1[CH2:33][CH2:34]1', 'products': '[C:1](=[O:2])([c:3]1[cH:4][cH:5][c:6]([N:7]2[CH2:8][C:9]3=[C:10]([CH2:11][N:12]([C:13](=[O:14])[c:15]4[cH:16][cH:17][cH:18][cH:19][c:20]4[C:21]([F:22])([F:23])[F:24])[CH2:25]3)[CH2:26]2)[n:27][cH:28]1)[NH:29][CH2:30][CH2:31][CH:32]1[CH2:33][CH2:34]1'}\n",
"{'_id': 'US20130143892A1', 'reactants': 'O[C:1]([c:2]1[cH:3][c:4](-[c:5]2[cH:6][cH:7][c:8]([C:9]#[N:10])[cH:11][c:12]2[F:13])[n:14][o:15]1)=[O:16].[CH3:17][C:18]([CH3:19])([CH3:20])[c:21]1[n:22][o:23][c:24]([N:25]2[CH2:26][CH2:27][CH:28]([NH:29][CH:30]3[CH2:31][CH2:32]3)[CH2:33][CH2:34]2)[n:35]1', 'products': '[C:1]([c:2]1[cH:3][c:4](-[c:5]2[cH:6][cH:7][c:8]([C:9]#[N:10])[cH:11][c:12]2[F:13])[n:14][o:15]1)(=[O:16])[N:29]([CH:28]1[CH2:27][CH2:26][N:25]([c:24]2[o:23][n:22][c:21]([C:18]([CH3:17])([CH3:19])[CH3:20])[n:35]2)[CH2:34][CH2:33]1)[CH:30]1[CH2:31][CH2:32]1'}\n",
"{'_id': 'US20090123563A1', 'reactants': 'O=C1CCC(=O)N1[Br:1].[CH3:2][c:3]1[cH:4][cH:5][c:6]([C:7](=[O:8])[OH:9])[cH:10][c:11]1[F:12]', 'products': '[Br:1][CH2:2][c:3]1[cH:4][cH:5][c:6]([C:7](=[O:8])[OH:9])[cH:10][c:11]1[F:12]'}\n",
"{'_id': 'US20090105305A1', 'reactants': 'C[O:1][C:2](=[O:3])[CH2:4][S:5](=[O:6])(=[O:7])[NH:8][c:9]1[cH:10][c:11]([C:12](=[O:13])[N:14]2[CH2:15][CH2:16][CH:17]([c:18]3[cH:19][cH:20][c:21]([C:22]#[N:23])[cH:24][cH:25]3)[CH2:26][CH2:27]2)[cH:28][cH:29][c:30]1[CH3:31]', 'products': '[OH:1][C:2](=[O:3])[CH2:4][S:5](=[O:6])(=[O:7])[NH:8][c:9]1[cH:10][c:11]([C:12](=[O:13])[N:14]2[CH2:15][CH2:16][CH:17]([c:18]3[cH:19][cH:20][c:21]([C:22]#[N:23])[cH:24][cH:25]3)[CH2:26][CH2:27]2)[cH:28][cH:29][c:30]1[CH3:31]'}\n",
"{'_id': 'US07910592B2', 'reactants': 'O=S(=O)(O[C:1]1=[CH:2][CH2:3][C:4]2([CH2:5][CH2:6]1)[O:7][CH2:8][CH2:9][O:10]2)C(F)(F)F.OB(O)[c:11]1[cH:12][cH:13][c:14]([O:15][CH3:16])[cH:17][cH:18]1', 'products': '[C:1]1([c:11]2[cH:12][cH:13][c:14]([O:15][CH3:16])[cH:17][cH:18]2)=[CH:2][CH2:3][C:4]2([CH2:5][CH2:6]1)[O:7][CH2:8][CH2:9][O:10]2'}\n",
"{'_id': 'US20070066624A1', 'reactants': '[CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[N:8]1[CH2:9][CH2:10][CH:11]([N:12]2[C:13](=[O:14])[NH:15][CH2:16][C@H:17]2[c:18]2[cH:19][cH:20][cH:21][cH:22][cH:23]2)[CH2:24][CH2:25]1.[CH3:26][N:27]=[C:28]=[O:29]', 'products': '[CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[N:8]1[CH2:9][CH2:10][CH:11]([N:12]2[C:13](=[O:14])[N:15]([C:28]([NH:27][CH3:26])=[O:29])[CH2:16][C@H:17]2[c:18]2[cH:19][cH:20][cH:21][cH:22][cH:23]2)[CH2:24][CH2:25]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 67/40008 [00:01<12:17, 54.19it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US07598393B2', 'reactants': 'Br[c:1]1[c:2]([N+:3](=[O:4])[O-:5])[cH:6][c:7]([Br:8])[cH:9][cH:10]1.[NH2:11][CH2:12][C:13]1([OH:14])[CH2:15][CH2:16][CH2:17][CH2:18]1', 'products': '[c:1]1([NH:11][CH2:12][C:13]2([OH:14])[CH2:15][CH2:16][CH2:17][CH2:18]2)[c:2]([N+:3](=[O:4])[O-:5])[cH:6][c:7]([Br:8])[cH:9][cH:10]1'}\n",
"{'_id': 'US20020192594A1', 'reactants': 'Cl[C:1]([CH:2]=[CH2:3])=[O:4].[OH:5][CH2:6][C:7]([Cl:8])([Cl:9])[Cl:10]', 'products': '[C:1]([CH:2]=[CH2:3])(=[O:4])[O:5][CH2:6][C:7]([Cl:8])([Cl:9])[Cl:10]'}\n",
"{'_id': 'US06599922B2', 'reactants': 'FC(F)(F)CO[C:1](=[O:2])[c:3]1[cH:4][c:5]([O:6][CH2:7][C:8]([F:9])([F:10])[F:11])[cH:12][cH:13][c:14]1[O:15][CH2:16][C:17]([F:18])([F:19])[F:20].[NH2:21][CH2:22][CH:23]1[CH2:24][CH2:25][CH2:26][CH2:27][NH:28]1', 'products': '[C:1](=[O:2])([c:3]1[cH:4][c:5]([O:6][CH2:7][C:8]([F:9])([F:10])[F:11])[cH:12][cH:13][c:14]1[O:15][CH2:16][C:17]([F:18])([F:19])[F:20])[NH:21][CH2:22][CH:23]1[CH2:24][CH2:25][CH2:26][CH2:27][NH:28]1'}\n",
"{'_id': 'US20150218170A1', 'reactants': 'Br[c:1]1[c:2]([CH3:3])[n:4][c:5]2[c:6]([n:7]1)[c:8]([C:9](=[O:10])[NH:11][C:12]([CH3:13])([CH3:14])[CH3:15])[cH:16][n:17]2[CH2:18][O:19][CH2:20][CH2:21][Si:22]([CH3:23])([CH3:24])[CH3:25].[CH3:26][n:27]1[cH:28][c:29]([NH2:30])[cH:31][n:32]1', 'products': '[c:1]1([NH:30][c:29]2[cH:28][n:27]([CH3:26])[n:32][cH:31]2)[c:2]([CH3:3])[n:4][c:5]2[c:6]([n:7]1)[c:8]([C:9](=[O:10])[NH:11][C:12]([CH3:13])([CH3:14])[CH3:15])[cH:16][n:17]2[CH2:18][O:19][CH2:20][CH2:21][Si:22]([CH3:23])([CH3:24])[CH3:25]'}\n",
"{'_id': 'US20150259351A1', 'reactants': 'F[c:1]1[n:2][cH:3][cH:4][cH:5][c:6]1[I:7].[CH3:15][C@@H:16]([NH3+:17])[CH:18]1[CH2:19][CH2:20][O:21][CH2:22][CH2:23]1', 'products': '[c:1]1([NH:17][C@H:16]([CH3:15])[CH:18]2[CH2:19][CH2:20][O:21][CH2:22][CH2:23]2)[n:2][cH:3][cH:4][cH:5][c:6]1[I:7]'}\n",
"{'_id': 'US20120108574A1', 'reactants': 'Cl[c:1]1[c:2](-[c:3]2[cH:4][cH:5][c:6]([C:7]3([NH:8][C:9]([O:10][C:11]([CH3:12])([CH3:13])[CH3:14])=[O:15])[CH2:16][CH2:17][CH2:18]3)[cH:19][cH:20]2)[n:21]2[c:22]([n:23]1)-[c:24]1[c:25]([cH:26][cH:27][cH:28][cH:29]1)[NH:30][c:31]1[c:32]-2[cH:33][cH:34][cH:35][n:36]1.OB(O)[c:37]1[cH:38][cH:39][c:40]([C:41](=[O:42])[O:43][CH2:44][c:45]2[cH:46][cH:47][cH:48][cH:49][cH:50]2)[cH:51][cH:52]1', 'products': '[c:1]1(-[c:37]2[cH:38][cH:39][c:40]([C:41](=[O:42])[O:43][CH2:44][c:45]3[cH:46][cH:47][cH:48][cH:49][cH:50]3)[cH:51][cH:52]2)[c:2](-[c:3]2[cH:4][cH:5][c:6]([C:7]3([NH:8][C:9]([O:10][C:11]([CH3:12])([CH3:13])[CH3:14])=[O:15])[CH2:16][CH2:17][CH2:18]3)[cH:19][cH:20]2)[n:21]2[c:22]([n:23]1)-[c:24]1[c:25]([cH:26][cH:27][cH:28][cH:29]1)[NH:30][c:31]1[c:32]-2[cH:33][cH:34][cH:35][n:36]1'}\n",
"{'_id': 'US07897602B2', 'reactants': 'CC[O:1][C:2](=[O:3])[CH2:4][CH2:5][c:6]1[c:7](/[CH:8]=[C:9]2\\\\[C:10](=[O:11])[NH:12][c:13]3[cH:14][cH:15][cH:16][cH:17][c:18]32)[nH:19][c:20]2[c:21]1[C:22](=[O:23])[CH2:24][CH2:25][CH2:26]2', 'products': '[OH:1][C:2](=[O:3])[CH2:4][CH2:5][c:6]1[c:7](/[CH:8]=[C:9]2\\\\[C:10](=[O:11])[NH:12][c:13]3[cH:14][cH:15][cH:16][cH:17][c:18]32)[nH:19][c:20]2[c:21]1[C:22](=[O:23])[CH2:24][CH2:25][CH2:26]2'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 73/40008 [00:01<12:08, 54.84it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20070060584A1', 'reactants': 'CO[C:1](=[O:2])[c:3]1[n:4][cH:5][c:6](-[c:7]2[cH:8][cH:9][cH:10][c:11](-[c:12]3[cH:13][cH:14][cH:15][cH:16][c:17]3[C:18]([F:19])([F:20])[F:21])[cH:22]2)[nH:23]1.[NH3:24]', 'products': '[C:1](=[O:2])([c:3]1[n:4][cH:5][c:6](-[c:7]2[cH:8][cH:9][cH:10][c:11](-[c:12]3[cH:13][cH:14][cH:15][cH:16][c:17]3[C:18]([F:19])([F:20])[F:21])[cH:22]2)[nH:23]1)[NH2:24]'}\n",
"{'_id': 'US06255315B1', 'reactants': '[CH:1]1=[C:2]([c:3]2[cH:4][cH:5][cH:6][cH:7][n:8]2)[CH2:9][CH2:10][C:11]2([CH2:12]1)[O:13][CH2:14][CH2:15][O:16]2', 'products': '[CH2:1]1[CH:2]([c:3]2[cH:4][cH:5][cH:6][cH:7][n:8]2)[CH2:9][CH2:10][C:11]2([CH2:12]1)[O:13][CH2:14][CH2:15][O:16]2'}\n",
"{'_id': 'US09067921B2', 'reactants': 'CC(C)(C)OC(=O)[N:1]1[CH2:2][C@H:3]([CH3:4])[N:5]([CH:6]2[CH2:7][CH2:8][c:9]3[cH:10][cH:11][c:12]([C:13]([F:14])([F:15])[F:16])[cH:17][c:18]32)[CH2:19][CH2:20]1', 'products': '[NH:1]1[CH2:2][C@H:3]([CH3:4])[N:5]([CH:6]2[CH2:7][CH2:8][c:9]3[cH:10][cH:11][c:12]([C:13]([F:14])([F:15])[F:16])[cH:17][c:18]32)[CH2:19][CH2:20]1'}\n",
"{'_id': 'US20140113898A1', 'reactants': 'Br[CH2:1][CH2:2][Br:3].[CH3:4][O:5][C:6](=[O:7])[c:8]1[cH:9][c:10]([C:11]([F:12])([F:13])[F:14])[n:15][nH:16]1', 'products': '[CH2:1]([CH2:2][Br:3])[n:16]1[c:8]([C:6]([O:5][CH3:4])=[O:7])[cH:9][c:10]([C:11]([F:12])([F:13])[F:14])[n:15]1'}\n",
"{'_id': 'US06329405B1', 'reactants': 'O=[N+:1]([O-])[c:2]1[cH:3][cH:4][cH:5][c:6]([CH:7]([NH:8][CH:9]([CH3:10])[c:11]2[cH:12][cH:13][c:14]([F:15])[c:16]([F:17])[cH:18]2)[c:19]2[cH:20][cH:21][c:22]([F:23])[cH:24][cH:25]2)[cH:26]1', 'products': '[NH2:1][c:2]1[cH:3][cH:4][cH:5][c:6]([CH:7]([NH:8][CH:9]([CH3:10])[c:11]2[cH:12][cH:13][c:14]([F:15])[c:16]([F:17])[cH:18]2)[c:19]2[cH:20][cH:21][c:22]([F:23])[cH:24][cH:25]2)[cH:26]1'}\n",
"{'_id': 'US07410972B2', 'reactants': 'Cl[S:1](=[O:2])(=[O:3])[c:4]1[cH:5][cH:6][c:7]([Br:8])[s:9]1.[CH3:10][O:11][c:12]1[n:13][c:14]([Cl:15])[cH:16][n:17][c:18]1[NH2:19]', 'products': '[S:1](=[O:2])(=[O:3])([c:4]1[cH:5][cH:6][c:7]([Br:8])[s:9]1)[NH:19][c:18]1[c:12]([O:11][CH3:10])[n:13][c:14]([Cl:15])[cH:16][n:17]1'}\n",
"{'_id': 'US05011992', 'reactants': 'Br[CH2:1][C:2]([CH2:3][CH3:4])=[O:5].[CH3:6][CH2:7][N:8]([CH2:9][CH3:10])[CH2:11][CH2:12][NH:13][C:14](=[O:15])[c:16]1[cH:17][c:18]([Cl:19])[c:20]([NH2:21])[cH:22][c:23]1[OH:24]', 'products': '[CH2:1]([C:2]([CH2:3][CH3:4])=[O:5])[O:24][c:23]1[c:16]([C:14]([NH:13][CH2:12][CH2:11][N:8]([CH2:7][CH3:6])[CH2:9][CH3:10])=[O:15])[cH:17][c:18]([Cl:19])[c:20]([NH2:21])[cH:22]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 80/40008 [00:01<11:41, 56.94it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US08791102B2', 'reactants': 'O[C:1](=[O:2])[CH2:3][c:4]1[cH:5][n:6][cH:7][cH:8][n:9]1.[NH2:10][C@@H:11]([CH2:12][O:13][CH2:14][c:15]1[cH:16][cH:17][cH:18][cH:19][cH:20]1)[C:21](=[O:22])[NH:23][c:24]1[cH:25][cH:26][c:27]([O:28][c:29]2[cH:30][cH:31][c:32]([F:33])[cH:34][cH:35]2)[cH:36][cH:37]1', 'products': '[C:1](=[O:2])([CH2:3][c:4]1[cH:5][n:6][cH:7][cH:8][n:9]1)[NH:10][C@@H:11]([CH2:12][O:13][CH2:14][c:15]1[cH:16][cH:17][cH:18][cH:19][cH:20]1)[C:21](=[O:22])[NH:23][c:24]1[cH:25][cH:26][c:27]([O:28][c:29]2[cH:30][cH:31][c:32]([F:33])[cH:34][cH:35]2)[cH:36][cH:37]1'}\n",
"{'_id': 'US04233318', 'reactants': '[CH3:1][CH2:2][N:3]=[C:4]=[S:5].[NH2:6][CH2:7][c:8]1[cH:9][cH:10][cH:11][cH:12][cH:13]1', 'products': '[CH3:1][CH2:2][NH:3][C:4](=[S:5])[NH:6][CH2:7][c:8]1[cH:9][cH:10][cH:11][cH:12][cH:13]1'}\n",
"{'_id': 'US08058441B2', 'reactants': '[CH2:1]=[CH:2][C:3](=[O:4])[NH:5][c:6]1[cH:7][cH:8][c:9]2[c:10]([cH:11]1)[C:12]([CH3:13])([c:14]1[cH:15][cH:16][cH:17][cH:18][cH:19]1)[CH2:20][C:21]([CH3:22])([CH3:23])[N:24]2[C:25]([CH3:26])=[O:27].[CH3:28][CH2:29][O:30][CH2:31][CH2:32][OH:33]', 'products': '[CH2:1]([CH2:2][C:3](=[O:4])[NH:5][c:6]1[cH:7][cH:8][c:9]2[c:10]([cH:11]1)[C:12]([CH3:13])([c:14]1[cH:15][cH:16][cH:17][cH:18][cH:19]1)[CH2:20][C:21]([CH3:22])([CH3:23])[N:24]2[C:25]([CH3:26])=[O:27])[O:33][CH2:32][CH2:31][O:30][CH2:29][CH3:28]'}\n",
"{'_id': 'US20100120864A1', 'reactants': 'CO[C:1](=[O:2])[CH:3]([NH2:4])[c:5]1[cH:6][cH:7][cH:8][c:9]([NH:10][CH:11]2[CH2:12][CH2:13][CH2:14][CH2:15][CH2:16]2)[cH:17]1', 'products': '[CH2:1]([OH:2])[CH:3]([NH2:4])[c:5]1[cH:6][cH:7][cH:8][c:9]([NH:10][CH:11]2[CH2:12][CH2:13][CH2:14][CH2:15][CH2:16]2)[cH:17]1'}\n",
"{'_id': 'US20100204196A1', 'reactants': 'O=[CH2:1].[CH3:2][O:3][c:4]1[cH:5][cH:6][c:7]([NH2:8])[c:9]([CH3:10])[cH:11]1', 'products': '[CH3:1][NH:8][c:7]1[cH:6][cH:5][c:4]([O:3][CH3:2])[cH:11][c:9]1[CH3:10]'}\n",
"{'_id': 'US04665079', 'reactants': 'Cl[c:1]1[c:2]([F:3])[cH:4][c:5]2[c:6](=[O:7])[c:8]([C:9](=[O:10])[OH:11])[cH:12][n:13]([CH2:14][CH3:15])[c:16]2[n:17]1.[CH3:18][CH2:19][NH:20][CH2:21][CH:22]1[CH2:23][CH2:24][NH:25][CH2:26]1', 'products': '[c:1]1([N:25]2[CH2:24][CH2:23][CH:22]([CH2:21][NH:20][CH2:19][CH3:18])[CH2:26]2)[c:2]([F:3])[cH:4][c:5]2[c:6](=[O:7])[c:8]([C:9](=[O:10])[OH:11])[cH:12][n:13]([CH2:14][CH3:15])[c:16]2[n:17]1'}\n",
"{'_id': 'US20100029655A1', 'reactants': 'Cl[CH2:1][c:2]1[cH:3][c:4](-[c:5]2[cH:6][cH:7][c:8]([CH2:9][N:10]3[CH2:11][CH2:12][N:13]([CH3:14])[CH2:15][CH2:16]3)[cH:17][cH:18]2)[n:19][o:20]1.[F:21][c:22]1[cH:23][cH:24][cH:25][c:26](-[c:27]2[n:28][c:29]3[cH:30][n:31][nH:32][cH:33][c:34]-3[n:35]2)[c:36]1[F:37]', 'products': '[CH2:1]([c:2]1[cH:3][c:4](-[c:5]2[cH:6][cH:7][c:8]([CH2:9][N:10]3[CH2:11][CH2:12][N:13]([CH3:14])[CH2:15][CH2:16]3)[cH:17][cH:18]2)[n:19][o:20]1)[n:32]1[n:31][cH:30][c:29]2[n:28][c:27](-[c:26]3[cH:25][cH:24][cH:23][c:22]([F:21])[c:36]3[F:37])[n:35][c:34]-2[cH:33]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 87/40008 [00:01<11:04, 60.11it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20100197908A1', 'reactants': 'IC[I:1].N[c:2]1[s:3][cH:4][c:5]([C:6]([O:7][CH2:8][CH3:9])=[O:10])[n:11]1', 'products': '[I:1][c:2]1[s:3][cH:4][c:5]([C:6]([O:7][CH2:8][CH3:9])=[O:10])[n:11]1'}\n",
"{'_id': 'US20090093459A1', 'reactants': 'O[C:1]([C@@H:2]([NH:3][O:4][C:5]([C:6]([CH3:7])([CH3:8])[CH3:9])=[O:10])[CH2:11][S:12][c:13]1[c:14]([NH2:15])[cH:16][cH:17][cH:18][c:19]1-[c:20]1[cH:21][cH:22][cH:23][cH:24][cH:25]1)=[O:26]', 'products': '[C:1]1(=[O:26])[C@@H:2]([NH:3][O:4][C:5]([C:6]([CH3:7])([CH3:8])[CH3:9])=[O:10])[CH2:11][S:12][c:13]2[c:14]([cH:16][cH:17][cH:18][c:19]2-[c:20]2[cH:21][cH:22][cH:23][cH:24][cH:25]2)[NH:15]1'}\n",
"{'_id': 'US20090118284A1', 'reactants': 'Br[c:1]1[c:2]2[c:3]([cH:4][cH:5][c:6]([N+:7](=[O:8])[O-:9])[cH:10]2)[n:11]([C:12]([c:13]2[cH:14][cH:15][cH:16][cH:17][cH:18]2)([c:19]2[cH:20][cH:21][cH:22][cH:23][cH:24]2)[c:25]2[cH:26][cH:27][cH:28][cH:29][cH:30]2)[n:31]1.CC1(C)OB([c:32]2[cH:33][n:34][nH:35][cH:36]2)OC1(C)C', 'products': '[c:1]1(-[c:32]2[cH:33][n:34][nH:35][cH:36]2)[c:2]2[c:3]([cH:4][cH:5][c:6]([N+:7](=[O:8])[O-:9])[cH:10]2)[n:11]([C:12]([c:13]2[cH:14][cH:15][cH:16][cH:17][cH:18]2)([c:19]2[cH:20][cH:21][cH:22][cH:23][cH:24]2)[c:25]2[cH:26][cH:27][cH:28][cH:29][cH:30]2)[n:31]1'}\n",
"{'_id': 'US20130331561A1', 'reactants': 'CC(C)(C)[Si](C)(C)[O:1][CH2:2][C@@H:3]1[C@@H:4]([O:5][C:6](=[O:7])[O:8][CH2:9][c:10]2[cH:11][cH:12][cH:13][cH:14][cH:15]2)[CH2:16][C@H:17]([n:18]2[cH:19][c:20]3[cH:21][c:22](-[c:23]4[cH:24][cH:25][c:26]([CH2:27][CH2:28][CH2:29][CH2:30][CH3:31])[cH:32][cH:33]4)[o:34][c:35]3[n:36][c:37]2=[O:38])[O:39]1', 'products': '[OH:1][CH2:2][C@@H:3]1[C@@H:4]([O:5][C:6](=[O:7])[O:8][CH2:9][c:10]2[cH:11][cH:12][cH:13][cH:14][cH:15]2)[CH2:16][C@H:17]([n:18]2[cH:19][c:20]3[cH:21][c:22](-[c:23]4[cH:24][cH:25][c:26]([CH2:27][CH2:28][CH2:29][CH2:30][CH3:31])[cH:32][cH:33]4)[o:34][c:35]3[n:36][c:37]2=[O:38])[O:39]1'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 94/40008 [00:01<11:42, 56.79it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'US20140088306A1', 'reactants': 'CC(C)(C)[Si](C)(C)[O:1][c:2]1[cH:3][cH:4][c:5]2[n:6][cH:7][c:8](-[c:9]3[cH:10][cH:11][c:12]([N:13]([CH3:14])[CH3:15])[cH:16][cH:17]3)[n:18][c:19]2[cH:20]1', 'products': '[OH:1][c:2]1[cH:3][cH:4][c:5]2[n:6][cH:7][c:8](-[c:9]3[cH:10][cH:11][c:12]([N:13]([CH3:14])[CH3:15])[cH:16][cH:17]3)[n:18][c:19]2[cH:20]1'}\n",
"{'_id': 'US20070155805A1', 'reactants': 'C[O:1][C:2](=[O:3])[c:4]1[s:5][c:6]2[cH:7][c:8]([C:9]([F:10])([F:11])[F:12])[cH:13][cH:14][c:15]2[c:16]1[CH3:17]', 'products': '[OH:1][C:2](=[O:3])[c:4]1[s:5][c:6]2[cH:7][c:8]([C:9]([F:10])([F:11])[F:12])[cH:13][cH:14][c:15]2[c:16]1[CH3:17]'}\n",
"{'_id': 'US20140336106A1', 'reactants': 'CC(C)(C)OC(=O)[NH:1][C@H:2]([C:3]([NH:4][C@@H:5]([C@@H:6]([c:7]1[cH:8][cH:9][c:10]([O:11][CH3:12])[cH:13][cH:14]1)[OH:15])[C:16](=[O:17])[O:18][CH2:19][c:20]1[cH:21][cH:22][cH:23][cH:24][cH:25]1)=[O:26])[CH2:27][OH:28]', 'products': '[NH2:1][C@H:2]([C:3]([NH:4][C@@H:5]([C@@H:6]([c:7]1[cH:8][cH:9][c:10]([O:11][CH3:12])[cH:13][cH:14]1)[OH:15])[C:16](=[O:17])[O:18][CH2:19][c:20]1[cH:21][cH:22][cH:23][cH:24][cH:25]1)=[O:26])[CH2:27][OH:28]'}\n",
"{'_id': 'US03987160', 'reactants': 'O=[N:1][c:2]1[c:3]([NH2:4])[n:5][c:6](-[c:7]2[c:8]([O:9][CH2:10][CH2:11][CH3:12])[cH:13][cH:14][c:15]([CH3:16])[cH:17]2)[nH:18][c:19]1=[O:20]', 'products': '[NH2:1][c:2]1[c:3]([NH2:4])[n:5][c:6](-[c:7]2[c:8]([O:9][CH2:10][CH2:11][CH3:12])[cH:13][cH:14][c:15]([CH3:16])[cH:17]2)[nH:18][c:19]1=[O:20]'}\n",
"{'_id': 'US20150099777A1', 'reactants': 'C[O:1][C:2]([c:3]1[c:4]([CH2:5][CH2:6][CH3:7])[n:8][c:9]2[c:10]([cH:11]1)[C:12](=[O:13])[N:14]([CH:15]1[CH2:16][CH2:17][N:18]([CH2:19][c:20]3[cH:21][c:22]([CH:23]4[CH2:24][CH2:25]4)[c:26](-[c:27]4[cH:28][cH:29][c:30]([F:31])[cH:32][c:33]4[F:34])[c:35]([F:36])[c:37]3[O:38][CH3:39])[CH2:40][CH2:41]1)[CH2:42][CH2:43]2)=[O:44]', 'products': '[OH:1][C:2]([c:3]1[c:4]([CH2:5][CH2:6][CH3:7])[n:8][c:9]2[c:10]([cH:11]1)[C:12](=[O:13])[N:14]([CH:15]1[CH2:16][CH2:17][N:18]([CH2:19][c:20]3[cH:21][c:22]([CH:23]4[CH2:24][CH2:25]4)[c:26](-[c:27]4[cH:28][cH:29][c:30]([F:31])[cH:32][c:33]4[F:34])[c:35]([F:36])[c:37]3[O:38][CH3:39])[CH2:40][CH2:41]1)[CH2:42][CH2:43]2)=[O:44]'}\n",
"{'_id': 'US20130336921A1', 'reactants': 'Cl[C:1](=[O:2])[c:3]1[cH:4][c:5]([Cl:6])[cH:7][cH:8][c:9]1[N+:10](=[O:11])[O-:12].[CH3:13][c:14]1[cH:15][cH:16][c:17](-[n:18]2[cH:19][cH:20][c:21]([NH2:22])[n:23]2)[cH:24][c:25]1[CH3:26]', 'products': '[C:1](=[O:2])([c:3]1[cH:4][c:5]([Cl:6])[cH:7][cH:8][c:9]1[N+:10](=[O:11])[O-:12])[NH:22][c:21]1[cH:20][cH:19][n:18](-[c:17]2[cH:16][cH:15][c:14]([CH3:13])[c:25]([CH3:26])[cH:24]2)[n:23]1'}\n",
"Interrupted\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_46290/3887948522.py\u001b[0m in \u001b[0;36mextract\u001b[0;34m(reaction)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreaction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtemplate_extractor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract_from_reaction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreaction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/rdchiral/template_extractor.py\u001b[0m in \u001b[0;36mextract_from_reaction\u001b[0;34m(reaction)\u001b[0m\n\u001b[1;32m 777\u001b[0m \u001b[0;31m# Get fragments for reactants\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 778\u001b[0;31m reactant_fragments, intra_only, dimer_only = get_fragments_for_changed_atoms(reactants, changed_atom_tags, \n\u001b[0m\u001b[1;32m 779\u001b[0m radius = 1, expansion = [], category = 'reactants')\n",
"\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/rdchiral/template_extractor.py\u001b[0m in \u001b[0;36mget_fragments_for_changed_atoms\u001b[0;34m(mols, changed_atom_tags, radius, category, expansion)\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[0matom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSetIsotope\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0matom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetProp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'molAtomMapNumber'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 593\u001b[0;31m \u001b[0mmap_to_id\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0matom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetProp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'molAtomMapNumber'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0matom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGetIdx\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 594\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_46290/14750037.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtemplate_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreaction\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mreaction\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreactions_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtemplate_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreaction\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mreaction\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreactions_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtemplate_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreaction\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mreaction\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreactions_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/tmp/ipykernel_46290/14750037.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtemplate_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreaction\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mreaction\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreactions_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtemplate_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreaction\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mreaction\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreactions_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtemplate_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreaction\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mreaction\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreactions_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/tmp/ipykernel_46290/3887948522.py\u001b[0m in \u001b[0;36mextract\u001b[0;34m(reaction)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Interrupted'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"template_train = [extract(reaction) for reaction in tqdm(reactions_train)]\n",
"template_test = [extract(reaction) for reaction in tqdm(reactions_test)]\n",
"template_val = [extract(reaction) for reaction in tqdm(reactions_val)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39f3bafa-d62d-44b8-a555-afc9648faaa9",
"metadata": {},
"outputs": [],
"source": [
"df_template_train = pd.DataFrame(template_train)\n",
"df_template_test = pd.DataFrame(template_test)\n",
"df_template_val = pd.DataFrame(template_val)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57c20e11-09ef-4615-9b33-52dd9da89c4a",
"metadata": {},
"outputs": [],
"source": [
"df_template_train.to_csv('template_train.csv', index = False)\n",
"df_template_test.to_csv('template_test.csv', index = False)\n",
"df_template_val.to_csv('template_val.csv', index = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0683ae9a-3100-4fed-b3ac-13bd2756f52e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
import pandas as pd
import numpy as np
from rdchiral import template_extractor
from tqdm import tqdm
import multiprocessing
# Worker pool sized to the number of CPUs reported by the OS.
# NOTE(review): the pool is created before extract() is defined and the visible
# script neither submits work to it nor closes it -- confirm it is still needed.
cores = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=cores)

# Read the three split files, then stack them row-wise into one frame
# (train first, then test, then val).
template_train, template_test, template_val = [
    pd.read_csv(split_csv)
    for split_csv in (
        '/notebooks/Codes/brs/USPTO/schneider50k/template_train.csv',
        '/notebooks/Codes/brs/USPTO/schneider50k/template_test.csv',
        '/notebooks/Codes/brs/USPTO/schneider50k/template_val.csv',
    )
]
raw_data = pd.concat([template_train, template_test, template_val])
def extract(reaction):
    """Run rdchiral template extraction on one reaction, tolerating failures.

    Args:
        reaction: the reaction record handed unchanged to
            ``template_extractor.extract_from_reaction``; this script builds it
            as a dict with the keys ``'_id'``, ``'reactants'`` and ``'products'``.

    Returns:
        Whatever ``extract_from_reaction`` returns, or the string ``'Err'`` when
        it raises any ``Exception``.

    Raises:
        KeyboardInterrupt: propagated untouched, so interrupting a long run
            stops it immediately and keeps the original traceback.
    """
    try:
        return template_extractor.extract_from_reaction(reaction)
    except Exception:
        # Best-effort batch processing: one bad reaction must not abort the run.
        # KeyboardInterrupt derives from BaseException, not Exception, so it is
        # not caught here and reaches the caller as the very same exception
        # object (no replacement exception, no chained traceback).
        return 'Err'
def _row_to_reaction(row):
    """Turn one row of ``raw_data.values`` into the dict that extract() consumes."""
    # Column 2 is split on '>>': the first piece becomes the reactants and the
    # second piece the products. Column 0 is carried over as the id.
    sides = row[2].split('>>')
    return {'_id': row[0], 'reactants': sides[0], 'products': sides[1]}


reactions = list(map(_row_to_reaction, raw_data.values))
# Parallel variant, currently disabled:
# templates = pool.map(extract, tqdm(reactions))
templates = list(map(extract, tqdm(reactions)))
print(123)
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
loss,accuracy
0.004026058129966259,61.567686462707464
0.0037154078017920256,60.84783043391322
0.003669698955491185,61.2877424515097
0.004232234321534634,61.187762447510494
0.004387203138321638,60.64787042591482
0.003995717503130436,61.2877424515097
0.00400707870721817,60.887822435512895
0.0044508883729577065,60.96780643871226
0.004024198278784752,61.2877424515097
0.003794693388044834,61.087782443511294
loss,accuracy
0.0035638760309666395,87.64247150569886
0.0031065840739756823,87.84243151369726
0.003881766926497221,87.5624875024995
0.0045944699086248875,87.76244751049789
0.0034541983623057604,87.84243151369726
0.004402834922075272,87.60247950409918
0.004115797113627195,87.80243951209758
0.0042515951208770275,87.54249150169966
0.0036142636090517044,87.74245150969806
0.005122682545334101,87.84243151369726
loss,accuracy
0.004491719417273998,78.94421115776845
0.0037716312799602747,79.2241551689662
0.003074276028200984,78.88422315536893
0.003592598019167781,79.40411917616477
0.0029139474499970675,79.3241351729654
0.004435064736753702,79.2241551689662
0.0036734032910317183,78.6642671465707
0.003433571895584464,78.96420715856829
0.003273188369348645,79.16416716656668
0.0036596572026610374,79.124175164967
loss,accuracy
0.003986424766480923,83.90321935612877
0.004227079451084137,83.80323935212958
0.004254240542650223,84.02319536092782
0.0042970869690179825,83.88322335532894
0.0040547908283770084,84.06318736252749
0.003926473669707775,83.94321135772846
0.003974240738898516,83.74325134973006
0.005630603991448879,83.98320335932813
0.004349200986325741,83.84323135372925
0.0041490886360406876,83.80323935212958
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "28d23d16-8a0f-4acb-81f5-591bd3d57262",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from argparse import Namespace\n",
"from rdkit import Chem, DataStructs\n",
"from rdkit.Chem import AllChem\n",
"from torch.utils.data import Dataset, DataLoader, TensorDataset\n",
"from torch.autograd import Variable\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import random\n",
"import torch\n",
"import pandas as pd\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.nn.functional as F\n",
"import tqdm\n",
"from tqdm import tqdm, trange\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import OneHotEncoder, LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5a8d7fa1-942c-480d-bfe7-cba6344ae7b5",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using cuda:0 device\n"
]
}
],
"source": [
"# Get cpu or gpu device for training.\n",
"device_ids = range(6)\n",
"device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"print(\"Using {} device\".format(device))"
]
},
{
"cell_type": "markdown",
"id": "214943b7-9466-4cff-a01a-3294dcfe7b9e",
"metadata": {},
"source": [
"# Start building the reaction classification model"
]
},
{
"cell_type": "markdown",
"id": "e7288a96-8adc-45c3-bae2-385542f6c0a2",
"metadata": {},
"source": [
"## Read the data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d57fc395-6e2f-4a9f-af70-50b285601bc7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"origin_data = pd.read_csv('./dataset/uspto50k/data_processed.csv')\n",
"df_products = pd.read_csv('./dataset/uspto50k/df_products_2048_2.csv').values[:,0:-1]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "18a38e2f-9307-4f83-9b3c-1fed4fdfbdf6",
"metadata": {},
"outputs": [],
"source": [
"products = origin_data['product'].values\n",
"labels = origin_data['label'].values.reshape(-1,1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8a7b171d-7c4a-461a-8d21-9b14a187a765",
"metadata": {},
"outputs": [],
"source": [
"vocab = np.array(list(set('0'.join(products.tolist()))))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c16e4143",
"metadata": {},
"outputs": [],
"source": [
"vocab = np.array(list(set('0'.join(products.tolist()))))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f89d69a4-4c70-430a-aa9f-613e4d1c5fff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OneHotEncoder()"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoder = OneHotEncoder()\n",
"encoder.fit(vocab.reshape(-1,1))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "654feac5-6dd1-40b3-80ac-de9b9461c902",
"metadata": {},
"outputs": [],
"source": [
"# Settings shared by the cells below, collected in one attribute-style container.\n",
"config = Namespace(\n",
"    fingerprint_len = 2048,\n",
"    feature_size = 100,\n",
"    in_channels = 48,\n",
"    out_channels = 24,\n",
"    dropout_rate = 0.1,\n",
"    train_size = 0.8,\n",
"    test_size = 0.1,\n",
"    val_size = 0.1,\n",
"    batch_size = 128,\n",
"    # labels was reshaped to (n, 1) above. ndarray.max() reduces it to a scalar,\n",
"    # whereas the builtin max() compares the rows one by one in Python and hands\n",
"    # int() a one-element array (deprecated since NumPy 1.25).\n",
"    num_class = int(labels.max() + 1),\n",
"    # Number of distinct characters collected in vocab.\n",
"    vocal_size = len(vocab)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "28258e29-acb9-4dea-aed3-35b39e48968e",
"metadata": {},
"outputs": [],
"source": [
"def repair_product(product, feature_size):\n",
"    \"\"\"Return ``product`` as a character array of length ``feature_size``.\n",
"\n",
"    Strings longer than ``feature_size`` keep only their first ``feature_size``\n",
"    characters; shorter ones are right-padded with the character '0'.\n",
"    \"\"\"\n",
"    clipped = product[:feature_size]\n",
"    padded = clipped.ljust(feature_size, '0')\n",
"    return np.array(list(padded))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "726ae3f8-ffd1-4f4d-9b27-20db9c3bdb06",
"metadata": {},
"outputs": [],
"source": [
"products = [repair_product(product, config.feature_size) for product in products]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c6a9d5fa-bdd7-4e66-a4f5-3ec3abb096bf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 36%|███▌ | 18117/50016 [00:04<00:07, 4136.26it/s]"
]
}
],
"source": [
"encoded_products = np.array([encoder.transform(product.reshape(-1,1)).toarray().T for product in tqdm(products)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "829443c8-5ebc-4d47-bbca-5c554012bb70",
"metadata": {},
"outputs": [],
"source": [
"encoded_products = encoded_products.reshape(len(encoded_products),config.vocal_size*config.feature_size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2458f247-ab5c-4ca5-b61e-1adb766a0ea5",
"metadata": {},
"outputs": [],
"source": [
"data = np.hstack((encoded_products,df_products,labels))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "829a1b14-b8ad-4d5a-b5c9-40ac2e184162",
"metadata": {},
"outputs": [],
"source": [
"def get_train_test_data(data, train_size, test_size, val_size):\n",
"    \"\"\"Randomly split the rows of ``data`` into train, test and validation parts.\n",
"\n",
"    The last column of ``data`` is read as the label. The first row of every\n",
"    distinct label always goes to the training part. Test and validation rows\n",
"    are then drawn without replacement from the remaining rows, and whatever\n",
"    is left over joins the training part.\n",
"\n",
"    Args:\n",
"        data: 2-D array whose last column holds the label.\n",
"        train_size: accepted but not used; the training part is simply what\n",
"            remains after the other two parts are drawn.\n",
"        test_size: fraction of ``len(data)`` drawn for the test part.\n",
"        val_size: fraction of ``len(data)`` drawn for the validation part.\n",
"\n",
"    Returns:\n",
"        ``(train_data, test_data, val_data, train_data_key, test_data_key,\n",
"        val_data_key)``; each ``*_key`` list holds the row indices of its part.\n",
"    \"\"\"\n",
"    n_rows = len(data)\n",
"    _, first_seen = np.unique(data[:, -1], return_index=True)\n",
"    train_data_key = first_seen.tolist()\n",
"\n",
"    # Rows still available for sampling, kept in ascending order.\n",
"    reserved = set(train_data_key)\n",
"    candidates = [row for row in range(n_rows) if row not in reserved]\n",
"\n",
"    test_data_key = random.sample(candidates, int(test_size * n_rows))\n",
"    drawn = set(test_data_key)\n",
"    candidates = [row for row in candidates if row not in drawn]\n",
"\n",
"    val_data_key = random.sample(candidates, int(val_size * n_rows))\n",
"    drawn = set(val_data_key)\n",
"    train_data_key += [row for row in candidates if row not in drawn]\n",
"\n",
"    return (\n",
"        data[train_data_key],\n",
"        data[test_data_key],\n",
"        data[val_data_key],\n",
"        train_data_key,\n",
"        test_data_key,\n",
"        val_data_key,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bfcdf3a6-177a-4e7a-b2c8-6d0f8222e42e",
"metadata": {},
"outputs": [],
"source": [
"train_data, test_data, val_data, train_data_key, test_data_key, val_data_key = get_train_test_data(data, config.train_size, config.test_size, config.val_size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8b89c38-bd51-40b0-a964-0df39d7c38cc",
"metadata": {},
"outputs": [],
"source": [
"class MyDataset(Dataset):\n",
" def __init__(self, data, config):\n",
" self.data = data\n",
" \n",
" self.X = torch.from_numpy(self.data[:,0:-1]).type(torch.FloatTensor)\n",
" self.y = torch.from_numpy(self.data[:, -1]).type(torch.LongTensor)\n",
" \n",
" \n",
" def __getitem__(self, index):\n",
" return self.X[index], self.y[index]\n",
"\n",
" def __len__(self):\n",
" return self.data.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b1ac171-8aaa-426d-b717-3bfbcade1564",
"metadata": {},
"outputs": [],
"source": [
"train_dataset = MyDataset(train_data,config)\n",
"test_dataset = MyDataset(test_data,config)\n",
"val_dataset = MyDataset(val_data,config)\n",
"\n",
"train_dataloader = DataLoader(dataset = train_dataset, batch_size = config.batch_size, shuffle = True)\n",
"test_dataloader = DataLoader(dataset = test_dataset, batch_size = config.batch_size, shuffle = True)\n",
"val_dataloader = DataLoader(dataset = val_dataset, batch_size = config.batch_size, shuffle = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95cc40b8-abf0-4d4c-9f8c-2948f06553b8",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def train(dataloader, model, optimizer, loss_fn):\n",
" size = len(dataloader.dataset)\n",
" for batch, (X, y) in enumerate(dataloader):\n",
" X, y = X.to(device), y.to(device)\n",
"\n",
" # Compute prediction error\n",
" pred = model(X)\n",
"# print(pred)\n",
"# print(y)\n",
" loss = loss_fn(pred, y)\n",
"\n",
" # Backpropagation\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
" \n",
" return loss.item()\n",
"# if batch % 100 == 0:\n",
"# loss, current = loss.item(), batch * len(X)\n",
"# print(f\"loss: {loss:>7f} [{current:>5d}/{size:>5d}]\")\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a02bbcc-797c-4f50-880b-445837cf3f71",
"metadata": {},
"outputs": [],
"source": [
"def accuracy(dataloader, model, loss_fn, k):\n",
" size = len(dataloader.dataset)\n",
" num_batches = len(dataloader)\n",
" model.eval()\n",
" test_loss, correct = 0, 0\n",
" with torch.no_grad():\n",
" for X, y in dataloader:\n",
" X, y = X.to(device), y.to(device)\n",
" pred = model(X)\n",
" test_loss += loss_fn(pred, y).item()\n",
" _, pred = pred.topk(k, 1, True, True)\n",
" correct += torch.eq(pred, y.view(-1, 1)).sum().float().item()\n",
" test_loss /= num_batches\n",
" correct /= size\n",
" return correct * 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6707968e-e8af-41f5-bb8a-26b6eee1e576",
"metadata": {},
"outputs": [],
"source": [
"def test(dataloader, model, loss_fn):\n",
" size = len(dataloader.dataset)\n",
" num_batches = len(dataloader)\n",
" model.eval()\n",
" test_loss, correct = 0, 0\n",
" with torch.no_grad():\n",
" for X, y in dataloader:\n",
" X, y = X.to(device), y.to(device)\n",
" pred = model(X)\n",
" test_loss += loss_fn(pred, y).item()\n",
" correct += (pred.argmax(1) == y).type(torch.float).sum().item()\n",
" test_loss /= num_batches\n",
" correct /= size\n",
"# print(f\"Test Error: \\n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \\n\")\n",
" return correct * 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "411ff8c9-9c0d-46c6-98ac-7af10e0a9ddf",
"metadata": {},
"outputs": [],
"source": [
"class OSPNet(nn.Module):\n",
" def __init__(self, config):\n",
" super(OSPNet, self).__init__()\n",
" self.is_training = True\n",
" self.dropout_rate = config.dropout_rate\n",
" self.num_class = config.num_class\n",
" self.config = config\n",
" \n",
" \n",
" self.convs_sequence = nn.ModuleList([\n",
" nn.Sequential(nn.Conv1d(in_channels=48, out_channels=24, kernel_size=h),\n",
" nn.BatchNorm1d(num_features=config.out_channels),\n",
" nn.Sigmoid(),\n",
" nn.MaxPool1d(kernel_size=config.feature_size - h)\n",
" )\n",
" for h in range(5,100,5)])\n",
" self.convs_finterprint = nn.ModuleList([\n",
" nn.Sequential(nn.Conv1d(in_channels=1, out_channels=40, kernel_size=h),\n",
" nn.BatchNorm1d(num_features=40),\n",
" nn.Sigmoid(),\n",
" nn.MaxPool1d(kernel_size=config.fingerprint_len - h)\n",
" )\n",
" for h in range(32,2048,32)])\n",
" self.fc = nn.Linear(in_features=2976, out_features=config.num_class)\n",
" \n",
" def forward(self, x):\n",
" split_index = config.feature_size*config.in_channels\n",
" encoded_products = x[:,0:split_index].reshape(x.shape[0],config.in_channels,config.feature_size)\n",
" fingerprint_products = x[:,split_index:].reshape(x.shape[0],1,config.fingerprint_len)\n",
" \n",
" \n",
" \n",
" out_encoded = [conv(encoded_products) for conv in self.convs_sequence]\n",
" out_encoded = torch.cat(out_encoded, dim=1)\n",
" out_encoded = out_encoded.view(-1, out_encoded.size(1)) \n",
" out_encoded = F.dropout(input=out_encoded, p=self.dropout_rate)\n",
" \n",
"# print(out_encoded.shape)\n",
" \n",
" out_fingerprint = [conv(fingerprint_products) for conv in self.convs_finterprint]\n",
" out_fingerprint = torch.cat(out_fingerprint, dim=1)\n",
" out_fingerprint = out_fingerprint.view(-1, out_fingerprint.size(1))\n",
" out_fingerprint = F.dropout(input=out_fingerprint, p=self.dropout_rate)\n",
" \n",
"# print(out_fingerprint.shape)\n",
" \n",
" out = torch.cat((out_encoded,out_fingerprint),1)\n",
" \n",
" out = self.fc(out)\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72ab74b2-630f-4161-a332-2245bf13e22e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"result_list = pd.DataFrame()\n",
"\n",
"runs = 10\n",
"k = 1\n",
"\n",
"result_list = []\n",
"highest_accuracy = -1\n",
"# PATH = './metanetx/classification_result_cnn_final.pkl'\n",
"\n",
"for run in range(runs):\n",
"\n",
" loss_list = []\n",
" accuracy_list = []\n",
"\n",
" model = OSPNet(config).to(device)\n",
"# model = nn.DataParallel(model, device_ids=device_ids)\n",
"\n",
" loss_fn = nn.CrossEntropyLoss()\n",
" optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)\n",
"\n",
"\n",
" epochs = trange(100)\n",
"\n",
" for epoch in epochs:\n",
" loss = train(train_dataloader, model, optimizer, loss_fn)\n",
" train_accuracy = test(train_dataloader, model, loss_fn)\n",
" val_accuracy = test(val_dataloader, model, loss_fn)\n",
" # if val_accuracy > highest_accuracy:\n",
" # highest_accuracy = val_accuracy\n",
" # torch.save(model,PATH)\n",
" \n",
"# test_accuracy = test(test_dataloader, model, loss_fn)\n",
" topk_accuracy = accuracy(test_dataloader, model, loss_fn, k)\n",
" loss_list.append(loss)\n",
" accuracy_list.append(topk_accuracy)\n",
" epochs.set_description(\"run: %d | epoch: %d | loss: %.4f | train_accuracy: %.4f%% | val_accuracy: %.4f%% | top_%d_accuracy: %.4f%%\"%(run + 1, epoch + 1, loss, train_accuracy, val_accuracy, k ,topk_accuracy))\n",
"\n",
" result_list.append([min(loss_list), max(accuracy_list)])\n",
"result_list = pd.DataFrame(result_list,columns=['loss', 'accuracy'])\n",
"result_list.to_csv('./results/uspto50k/result_final_top%d.csv'%k, index = False)\n",
"print('Done!')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53f0dee7-c745-4270-8269-5e60045e2d25",
"metadata": {},
"outputs": [],
"source": [
"x = np.arange(100)\n",
"plt.plot(x,loss_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a0b737a-d1bc-43dc-85d0-94fcefcce88f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "61a57a4b5406d2de388e2f91097d4e4bcd7d5f4a46f53a795aa28a02eed27fc5"
},
"kernelspec": {
"display_name": "Python 3.8.3 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论