PubChem_Similarity

Search for chemical structures in PubChem via a Fingerprint Tanimoto Similarity Search
% Vincent F. Scalfani, Serena C. Ralph, and Jason E. Bara
% The University of Alabama
% Tested with MATLAB R2020a, running Ubuntu 18.04 on March 30, 2020.

Define the PubChem API base URL

% Base URL of the PubChem PUG REST compound endpoint
api = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/';
% MATLAB web options with a 30 second timeout
options = weboptions('Timeout', 30);
% Query compound: 1-Butyl-3-methyl-imidazolium; CID = 2734162
CID_SS_query = '2734162';
% Assemble the PNG request URL for the query CID
CID_url = sprintf('%scid/%s/PNG', api, CID_SS_query);
% Read the image (and its colormap) from the URL and display it
[CID_img, map] = imread(CID_url);
imshow(CID_img, map)
To search with a different compound, replace the CID value assigned to CID_SS_query above.

Retrieve InChI and SMILES

% Request the InChI of the query CID as plain text, then print it
inchi_url = sprintf('%scid/%s/property/inchi/TXT', api, CID_SS_query);
inchi = webread(inchi_url, options);
disp(inchi)
InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1
% Request the Isomeric SMILES of the query CID as plain text, then print it
IS_url = sprintf('%scid/%s/property/IsomericSMILES/TXT', api, CID_SS_query);
IS = webread(IS_url, options);
disp(IS)
CCCCN1C=C[N+](=C1)C

Perform a Similarity Search

% Search for chemical structures by Similarity Search (SS), using the
% compound in CID_SS_query (1-Butyl-3-methyl-imidazolium; CID = 2734162)
% as the query structure.
api = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/';
% 2D Tanimoto similarity threshold in percent. Change this single value
% (e.g., 97 or 90) to tighten or loosen the search.
SS_threshold = 95;
SS_url = sprintf('%sfastsimilarity_2d/cid/%s/cids/JSON?Threshold=%g', ...
    api, CID_SS_query, SS_threshold);
% webread decodes the JSON response into a struct
SS_CIDs = webread(SS_url, options);
% Keep the hit CIDs as an N-by-1 cell array, one CID per row.
% (:) forces a column so later row-wise indexing is always valid.
% No trailing semicolon: the CID list is displayed.
SS_CIDs = num2cell(SS_CIDs.IdentifierList.CID(:))
SS_CIDs = 249×1 cell
 1
1    12971008
2    304622
3    61347
4    11448496
5    11424151
6    11171745
7    2734161
8    118785
9    2734236
10    2734162
11    529334
12    11788435
13    11245926
14    11160028
15    5245884
16    2734168
17    91210418
18    87560886
19    87559770
20    87106874
21    24766551
22    17870330
23    16720567
24    15557008
25    15255204
26    12392681
27    12392676
28    11448364
29    11277167
30    11031767
31    10608883
32    10537570
33    10513048
34    10313448
35    10313447
36    10154187
37    4183883
38    139254006
39    134345956
40    122625623
41    121299516
42    118952202
43    118057427
44    117890836
45    117703152
46    117684660
47    102147231
48    90912888
49    89713026
50    89678233
51    89432682
52    88864524
53    88236103
54    87942618
55    87806569
56    87790333
57    87789992
58    87789923
59    87789740
60    87754289
61    87754264
62    87690425
63    87688227
64    87572548
65    87572214
66    87572213
67    87509019
68    87397668
69    87388314
70    87325711
71    87308565
72    87222859
73    87181405
74    87181202
75    87181050
76    87173651
77    87125511
78    87125508
79    87121545
80    87121544
81    87121543
82    87121443
83    87121324
84    87121318
85    87121317
86    87121316
87    87121297
88    87121296
89    87121295
90    87105369
91    87099925
92    87096071
93    87092336
94    69317070
95    68379078
96    67674484
97    66751376
98    60860613
99    60103428
100    59872702
In the similarity search request above (SS_url), the Tanimoto threshold can be adjusted to the desired value (e.g., 97 or 90).
% set a CID limit to 25 max
The limit of 25 CIDs is a safeguard that keeps the run time short during initial testing. This limit can be increased.
% Count the entries in SS_CIDs; no trailing semicolon, so the count is displayed
number_SS_CIDs = length(SS_CIDs)
number_SS_CIDs = 249
% Maximum number of CIDs to keep; raise this single value to process more hits
max_SS_CIDs = 25;
if number_SS_CIDs > max_SS_CIDs
    % keep only the first max_SS_CIDs entries (no semicolon: the list is displayed)
    SS_CIDs = SS_CIDs(1:max_SS_CIDs)
else
    disp('Number of SS_CIDs not changed')
end
SS_CIDs = 25×1 cell
 1
1    12971008
2    304622
3    61347
4    11448496
5    11424151
6    11171745
7    2734161
8    118785
9    2734236
10    2734162
11    529334
12    11788435
13    11245926
14    11160028
15    5245884
16    2734168
17    91210418
18    87560886
19    87559770
20    87106874
21    24766551
22    17870330
23    16720567
24    15557008
25    15255204

Retrieve Identifier and Property Data

% Create an identifier/property dataset from Similarity Search results.
% For every CID in the first column of SS_CIDs, retrieve the following data:
% InChI, Isomeric SMILES, MW, Heavy Atom Count, Rotatable Bond Count, and
% Charge.
api = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/';
% PubChem property names, listed in the order in which they are stored:
% the first name goes to column 2 of SS_CIDs, the second to column 3, etc.
% For example, MolecularWeight is third in the list, so the MW is placed in
% column 4: {1,4} for the first CID, {2,4} for the second, and so on.
% Additional property data can be collected by appending more names to this
% list, for example, if you want TPSA data add 'TPSA'; it would then be
% stored in column 8.
property_names = {'InChI', 'IsomericSMILES', 'MolecularWeight', ...
    'HeavyAtomCount', 'RotatableBondCount', 'Charge'};
% delay (seconds) after each request, to be polite to the PubChem server
pause_seconds = 0.5;
% Process each CID one-by-one. size(...,1) counts rows, so the loop is still
% correct if this section is re-run after the property columns were added.
for r = 1:size(SS_CIDs, 1)
    CID = SS_CIDs{r, 1};
    for p = 1:numel(property_names)
        % define the api call for this CID and property
        property_url = [api 'cid/' num2str(CID) '/property/' property_names{p} '/TXT'];
        % retrieve identifier or property data; any failed request is
        % recorded as 'not found' and execution continues
        try
            property_value = webread(property_url, options);
        catch
            property_value = 'not found';
        end
        % add the value to the SS_CIDs data array (column 1 holds the CID)
        SS_CIDs{r, p + 1} = property_value;
        % be polite to PubChem server
        pause(pause_seconds)
    end
end

Compile Data into a Table

% Convert every cell to a string, then strip leading and trailing white space
SS_CIDs_string = string(SS_CIDs);
SS_CIDs_string = strtrim(SS_CIDs_string);
% Build a table with one named variable per column
% (no trailing semicolon: the table is displayed)
SSq_table = array2table(SS_CIDs_string, ...
    'VariableNames', {'CID', 'InChI', 'IsoSMI', 'MW', ...
    'HeavyAtomCount', 'RotatableBondCount', 'Charge'})
SSq_table = 25×7 table
 CIDInChIIsoSMIMWHeavyAtomCountRotatableBondCountCharge
1"12971008""InChI=1S/C7H13N2.HI/c1-3-4-9-6-5-8(2)7-9;/h5-7H,3-4H2,1-2H3;1H/q+1;/p-1""CCCN1C=C[N+](=C1)C.[I-]""252.100000""10""2""0"
2"304622""InChI=1S/C8H14N2/c1-3-4-6-10-7-5-9-8(10)2/h5,7H,3-4,6H2,1-2H3""CCCCN1C=CN=C1C""138.210000""10""3""0"
3"61347""InChI=1S/C7H12N2/c1-2-3-5-9-6-4-8-7-9/h4,6-7H,2-3,5H2,1H3""CCCCN1C=CN=C1""124.180000""9""3""0"
4"11448496""InChI=1S/C8H15N2.HI/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H/q+1;/p-1""CCCCN1C=C[N+](=C1)C.[I-]""266.120000""11""3""0"
5"11424151""InChI=1S/C8H15N2.CHNS/c1-3-4-5-10-7-6-9(2)8-10;2-1-3/h6-8H,3-5H2,1-2H3;3H/q+1;/p-1""CCCCN1C=C[N+](=C1)C.C(#N)[S-]""197.300000""13""3""0"
6"11171745""InChI=1S/C8H15N2.C2N3/c1-3-4-5-10-7-6-9(2)8-10;3-1-5-2-4/h6-8H,3-5H2,1-2H3;/q+1;-1""CCCCN1C=C[N+](=C1)C.C(=[N-])=NC#N""205.260000""15""3""0"
7"2734161""InChI=1S/C8H15N2.ClH/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H/q+1;/p-1""CCCCN1C=C[N+](=C1)C.[Cl-]""174.670000""11""3""0"
8"118785""InChI=1S/C6H10N2/c1-2-4-8-5-3-7-6-8/h3,5-6H,2,4H2,1H3""CCCN1C=CN=C1""110.160000""8""2""0"
9"2734236""InChI=1S/C8H15N2.BrH/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H/q+1;/p-1""CCCCN1C=C[N+](=C1)C.[Br-]""219.120000""11""3""0"
10"2734162""InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1""CCCCN1C=C[N+](=C1)C""139.220000""10""3""1"
11"529334""InChI=1S/C8H14N2/c1-2-3-4-6-10-7-5-9-8-10/h5,7-8H,2-4,6H2,1H3""CCCCCN1C=CN=C1""138.210000""10""4""0"
12"11788435""InChI=1S/C8H15N2.H2O/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H2/q+1;/p-1""CCCCN1C=C[N+](=C1)C.[OH-]""156.230000""11""3""0"
13"11245926""InChI=1S/C8H15N2.Br2.BrH/c1-3-4-5-10-7-6-9(2)8-10;1-2;/h6-8H,3-5H2,1-2H3;;1H/q+1;;/p-1""CCCCN1C=C[N+](=C1)C.[Br-].BrBr""378.930000""13""3""0"
14"11160028""InChI=1S/C7H13N2.BrH/c1-3-4-9-6-5-8(2)7-9;/h5-7H,3-4H2,1-2H3;1H/q+1;/p-1""CCCN1C=C[N+](=C1)C.[Br-]""205.100000""10""2""0"
15"5245884""InChI=1S/C7H13N2/c1-3-4-9-6-5-8(2)7-9/h5-7H,3-4H2,1-2H3/q+1""CCCN1C=C[N+](=C1)C""125.190000""9""2""1"
16"2734168""InChI=1S/C9H17N2/c1-4-5-6-11-8-7-10(3)9(11)2/h7-8H,4-6H2,1-3H3/q+1""CCCCN1C=C[N+](=C1C)C""153.240000""11""3""1"
17"91210418""InChI=1S/C8H14IN2/c1-3-4-5-11-7-6-10(2)8(11)9/h6-7H,3-5H2,1-2H3/q+1""CCCCN1C=C[N+](=C1I)C""265.110000""11""3""1"
18"87560886""InChI=1S/C9H15N2.BrH/c1-3-5-6-11-8-7-10(4-2)9-11;/h4,7-9H,2-3,5-6H2,1H3;1H/q+1;/p-1""CCCC[N+]1=CN(C=C1)C=C.[Br-]""231.130000""12""4""0"
19"87559770""InChI=1S/C9H15N2.ClH/c1-3-5-6-11-8-7-10(4-2)9-11;/h4,7-9H,2-3,5-6H2,1H3;1H/q+1;/p-1""CCCC[N+]1=CN(C=C1)C=C.[Cl-]""186.680000""12""4""0"
20"87106874""InChI=1S/C13H25N2/c1-3-5-7-9-14-11-12-15(13-14)10-8-6-4-2/h11-13H,3-10H2,1-2H3/q+1""CCCCCN1C=C[N+](=C1)CCCCC""209.350000""15""8""1"
21"24766551""InChI=1S/C9H15N2/c1-3-5-6-11-8-7-10(4-2)9-11/h4,7-9H,2-3,5-6H2,1H3/q+1""CCCC[N+]1=CN(C=C1)C=C""151.230000""11""4""1"
22"17870330""InChI=1S/C8H15N3/c1-10(2)5-3-6-11-7-4-9-8-11/h4,7-8H,3,5-6H2,1-2H3""CN(C)CCCN1C=CN=C1""153.220000""11""4""0"
23"16720567""InChI=1S/C10H19N2.BrH/c1-3-5-7-12-9-8-11(10-12)6-4-2;/h8-10H,3-7H2,1-2H3;1H/q+1;/p-1""CCCCN1C=C[N+](=C1)CCC.[Br-]""247.180000""13""5""0"
24"15557008""InChI=1S/C8H14N2/c1-3-5-8-9-6-7-10(8)4-2/h6-7H,3-5H2,1-2H3""CCCC1=NC=CN1CC""138.210000""10""3""0"
25"15255204""InChI=1S/C11H21N2.ClH/c1-3-5-7-12-9-10-13(11-12)8-6-4-2;/h9-11H,3-8H2,1-2H3;1H/q+1;/p-1""CCCCN1C=C[N+](=C1)CCCC.[Cl-]""216.750000""14""6""0"
% Move the IsoSMI column to the front; all other columns keep their order
% (no trailing semicolon: the rearranged table is displayed)
SSq_table2 = movevars(SSq_table, 'IsoSMI', 'Before', 'CID')
SSq_table2 = 25×7 table
 IsoSMICIDInChIMWHeavyAtomCountRotatableBondCountCharge
1"CCCN1C=C[N+](=C1)C.[I-]""12971008""InChI=1S/C7H13N2.HI/c1-3-4-9-6-5-8(2)7-9;/h5-7H,3-4H2,1-2H3;1H/q+1;/p-1""252.100000""10""2""0"
2"CCCCN1C=CN=C1C""304622""InChI=1S/C8H14N2/c1-3-4-6-10-7-5-9-8(10)2/h5,7H,3-4,6H2,1-2H3""138.210000""10""3""0"
3"CCCCN1C=CN=C1""61347""InChI=1S/C7H12N2/c1-2-3-5-9-6-4-8-7-9/h4,6-7H,2-3,5H2,1H3""124.180000""9""3""0"
4"CCCCN1C=C[N+](=C1)C.[I-]""11448496""InChI=1S/C8H15N2.HI/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H/q+1;/p-1""266.120000""11""3""0"
5"CCCCN1C=C[N+](=C1)C.C(#N)[S-]""11424151""InChI=1S/C8H15N2.CHNS/c1-3-4-5-10-7-6-9(2)8-10;2-1-3/h6-8H,3-5H2,1-2H3;3H/q+1;/p-1""197.300000""13""3""0"
6"CCCCN1C=C[N+](=C1)C.C(=[N-])=NC#N""11171745""InChI=1S/C8H15N2.C2N3/c1-3-4-5-10-7-6-9(2)8-10;3-1-5-2-4/h6-8H,3-5H2,1-2H3;/q+1;-1""205.260000""15""3""0"
7"CCCCN1C=C[N+](=C1)C.[Cl-]""2734161""InChI=1S/C8H15N2.ClH/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H/q+1;/p-1""174.670000""11""3""0"
8"CCCN1C=CN=C1""118785""InChI=1S/C6H10N2/c1-2-4-8-5-3-7-6-8/h3,5-6H,2,4H2,1H3""110.160000""8""2""0"
9"CCCCN1C=C[N+](=C1)C.[Br-]""2734236""InChI=1S/C8H15N2.BrH/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H/q+1;/p-1""219.120000""11""3""0"
10"CCCCN1C=C[N+](=C1)C""2734162""InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1""139.220000""10""3""1"
11"CCCCCN1C=CN=C1""529334""InChI=1S/C8H14N2/c1-2-3-4-6-10-7-5-9-8-10/h5,7-8H,2-4,6H2,1H3""138.210000""10""4""0"
12"CCCCN1C=C[N+](=C1)C.[OH-]""11788435""InChI=1S/C8H15N2.H2O/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H2/q+1;/p-1""156.230000""11""3""0"
13"CCCCN1C=C[N+](=C1)C.[Br-].BrBr""11245926""InChI=1S/C8H15N2.Br2.BrH/c1-3-4-5-10-7-6-9(2)8-10;1-2;/h6-8H,3-5H2,1-2H3;;1H/q+1;;/p-1""378.930000""13""3""0"
14"CCCN1C=C[N+](=C1)C.[Br-]""11160028""InChI=1S/C7H13N2.BrH/c1-3-4-9-6-5-8(2)7-9;/h5-7H,3-4H2,1-2H3;1H/q+1;/p-1""205.100000""10""2""0"
15"CCCN1C=C[N+](=C1)C""5245884""InChI=1S/C7H13N2/c1-3-4-9-6-5-8(2)7-9/h5-7H,3-4H2,1-2H3/q+1""125.190000""9""2""1"
16"CCCCN1C=C[N+](=C1C)C""2734168""InChI=1S/C9H17N2/c1-4-5-6-11-8-7-10(3)9(11)2/h7-8H,4-6H2,1-3H3/q+1""153.240000""11""3""1"
17"CCCCN1C=C[N+](=C1I)C""91210418""InChI=1S/C8H14IN2/c1-3-4-5-11-7-6-10(2)8(11)9/h6-7H,3-5H2,1-2H3/q+1""265.110000""11""3""1"
18"CCCC[N+]1=CN(C=C1)C=C.[Br-]""87560886""InChI=1S/C9H15N2.BrH/c1-3-5-6-11-8-7-10(4-2)9-11;/h4,7-9H,2-3,5-6H2,1H3;1H/q+1;/p-1""231.130000""12""4""0"
19"CCCC[N+]1=CN(C=C1)C=C.[Cl-]""87559770""InChI=1S/C9H15N2.ClH/c1-3-5-6-11-8-7-10(4-2)9-11;/h4,7-9H,2-3,5-6H2,1H3;1H/q+1;/p-1""186.680000""12""4""0"
20"CCCCCN1C=C[N+](=C1)CCCCC""87106874""InChI=1S/C13H25N2/c1-3-5-7-9-14-11-12-15(13-14)10-8-6-4-2/h11-13H,3-10H2,1-2H3/q+1""209.350000""15""8""1"
21"CCCC[N+]1=CN(C=C1)C=C""24766551""InChI=1S/C9H15N2/c1-3-5-6-11-8-7-10(4-2)9-11/h4,7-9H,2-3,5-6H2,1H3/q+1""151.230000""11""4""1"
22"CN(C)CCCN1C=CN=C1""17870330""InChI=1S/C8H15N3/c1-10(2)5-3-6-11-7-4-9-8-11/h4,7-8H,3,5-6H2,1-2H3""153.220000""11""4""0"
23"CCCCN1C=C[N+](=C1)CCC.[Br-]""16720567""InChI=1S/C10H19N2.BrH/c1-3-5-7-12-9-8-11(10-12)6-4-2;/h8-10H,3-7H2,1-2H3;1H/q+1;/p-1""247.180000""13""5""0"
24"CCCC1=NC=CN1CC""15557008""InChI=1S/C8H14N2/c1-3-5-8-9-6-7-10(8)4-2/h6-7H,3-5H2,1-2H3""138.210000""10""3""0"
25"CCCCN1C=C[N+](=C1)CCCC.[Cl-]""15255204""InChI=1S/C11H21N2.ClH/c1-3-5-7-12-9-10-13(11-12)8-6-4-2;/h9-11H,3-8H2,1-2H3;1H/q+1;/p-1""216.750000""14""6""0"
% Data export as a tab-delimited text file.
% Prompt the user to select the folder for data export.
save_folder = uigetdir;
if isequal(save_folder, 0)
    % uigetdir returns 0 when the dialog is cancelled or closed;
    % skip the export instead of failing on an invalid folder
    disp('No folder selected; results were not exported')
else
    % write directly into the chosen folder rather than changing the
    % current working directory
    export_file = fullfile(save_folder, 'MATLAB_Similarityq_results.txt');
    writetable(SSq_table2, export_file, 'Delimiter', 'tab')
end

Retrieve Images of CID Compounds from Similarity Search

% Loop through the hit CIDs and show each structure image in its own figure.
api = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/';
% delay (seconds) after each request, to be polite to the PubChem server
pause_seconds = 0.5;
% Iterate over rows with size(...,1) and read the CID from column 1.
% SS_CIDs may hold property columns next to the CIDs; length() returns the
% largest dimension, so with fewer rows than columns it would over-count and
% linear indexing would pick up property values instead of CIDs.
for r = 1:size(SS_CIDs, 1)
    CID = SS_CIDs{r, 1};
    CID_url = [api 'cid/' num2str(CID) '/PNG'];
    try
        % retrieve CID PNG image and display it, titled with its CID
        [CID_img, map] = imread(CID_url);
        figure;
        imshow(CID_img, map)
        title(num2str(CID));
        drawnow;
    catch
        disp('CID image not found')
        disp('Execution will continue')
    end
    % be polite to PubChem server; pause after every request, including
    % requests that failed
    pause(pause_seconds);
end