Made the mistake of trying to help out a

Forum

Forum
Lounge
Made the mistake of trying to help out a

Made the mistake of trying to help out at SO again.

I guess I’ll never learn.

https://stackoverflow.com/questions/78396197/is-there-a-cleaner-way-to-write-this-c-code

Closed as “opinion-based”.

(When the hoi polloi learned to frob computer forums is when all semblance of reasoned thinking went out the window.)

Duthomhas (13148)

Meh, here’s what I played with, because it looked fun.

First, I figured I’d properly tokenize those atomic symbols into something useful. GNU provides a super-handy utility called gperf which generates a perfect hash lookup. We can massage its output pretty heavily to get:

elements.h

#ifndef ATOMIC_ELEMENTS_H
#define ATOMIC_ELEMENTS_H


// Compile-time guard: atomic_symbol_to_hash() in elements.c selects its table
// slot from the bit pattern of each character code ((x & 192) == 64, x & 63),
// so the build is refused unless 'A' has its ASCII value.
#if ('A' != 65)
	#error "elements.c requires ASCII alphabetic character codes."
#endif


// One row of the periodic table.
// The members are initialized positionally in elements.c (number, then symbol),
// so their order must not change.
struct Element
{
	const unsigned char number;     // Atomic Number: 1, 2, ...
	const char          symbol[3];  // Atomic Symbol: "H", "He", ... (at most 2 letters + '\0')
//	const char *        name;       // Element Name:  "Hydrogen", "Helium", ...
//	const float         weight;     // Atomic Weight: 1.008, 4.002602, ...
//	...
};


extern const struct Element Elements[ 1+118 ];
// The Periodic Table, indexed by elements' atomic number.
// Entry 0 is a placeholder (number 0, empty symbol) so that Elements[ n ]
// is the element whose atomic number is n, for n in 1..118.


unsigned atomic_symbol_to_number( const char * );
// Convert an element's atomic symbol ("H", "He", ...) to the corresponding atomic number.
// The argument is a NUL-terminated string and is compared exactly (case matters).
// Returns the atomic number else 0.

const struct Element * atomic_symbol_to_element( const char * );
// Convert an element's atomic symbol ("H", "He", ...) to Element data.
// The argument is a NUL-terminated string and is compared exactly (case matters).
// Returns a pointer into the Elements[] table else NULL.


#endif

elements.c

#include <iso646.h>
#include <string.h>
#include "elements.h"


const struct Element Elements[ 1+118 ] =
{
	// This is our Periodic Table of Elements
	// Indexed by element's atomic number
	// Entry 0 is a placeholder with an empty symbol: it keeps index == atomic
	// number, and no non-empty symbol can ever compare equal to it.
	{0,  ""  },
	{1,  "H" },{2,  "He"},{3,  "Li"},{4,  "Be"},{5,  "B" },{6,  "C" },{7,  "N" },{8,  "O" },
	{9,  "F" },{10, "Ne"},{11, "Na"},{12, "Mg"},{13, "Al"},{14, "Si"},{15, "P" },{16, "S" },
	{17, "Cl"},{18, "Ar"},{19, "K" },{20, "Ca"},{21, "Sc"},{22, "Ti"},{23, "V" },{24, "Cr"},
	{25, "Mn"},{26, "Fe"},{27, "Co"},{28, "Ni"},{29, "Cu"},{30, "Zn"},{31, "Ga"},{32, "Ge"},
	{33, "As"},{34, "Se"},{35, "Br"},{36, "Kr"},{37, "Rb"},{38, "Sr"},{39, "Y" },{40, "Zr"},
	{41, "Nb"},{42, "Mo"},{43, "Tc"},{44, "Ru"},{45, "Rh"},{46, "Pd"},{47, "Ag"},{48, "Cd"},
	{49, "In"},{50, "Sn"},{51, "Sb"},{52, "Te"},{53, "I" },{54, "Xe"},{55, "Cs"},{56, "Ba"},
	{57, "La"},{58, "Ce"},{59, "Pr"},{60, "Nd"},{61, "Pm"},{62, "Sm"},{63, "Eu"},{64, "Gd"},
	{65, "Tb"},{66, "Dy"},{67, "Ho"},{68, "Er"},{69, "Tm"},{70, "Yb"},{71, "Lu"},{72, "Hf"},
	{73, "Ta"},{74, "W" },{75, "Re"},{76, "Os"},{77, "Ir"},{78, "Pt"},{79, "Au"},{80, "Hg"},
	{81, "Tl"},{82, "Pb"},{83, "Bi"},{84, "Po"},{85, "At"},{86, "Rn"},{87, "Fr"},{88, "Ra"},
	{89, "Ac"},{90, "Th"},{91, "Pa"},{92, "U" },{93, "Np"},{94, "Pu"},{95, "Am"},{96, "Cm"},
	{97, "Bk"},{98, "Cf"},{99, "Es"},{100,"Fm"},{101,"Md"},{102,"No"},{103,"Lr"},{104,"Rf"},
	{105,"Db"},{106,"Sg"},{107,"Bh"},{108,"Hs"},{109,"Mt"},{110,"Ds"},{111,"Rg"},{112,"Cn"},
	{113,"Nh"},{114,"Fl"},{115,"Mc"},{116,"Lv"},{117,"Ts"},{118,"Og"},
};


static
const unsigned char hash_to_element[ 216 ] =
{
	// This is our hash table!
	// (key, value) --> (hash, index into Elements[])
	//
	// The key is the value returned by atomic_symbol_to_hash(). A 0 entry means
	// that no element owns that hash; it selects the placeholder Elements[0],
	// whose empty symbol fails the strcmp() performed by the lookup functions.
	// A non-zero entry is only a candidate: the caller must still compare the
	// symbol text, because invalid symbols can hash to an occupied slot.
	0,15,59,1,94,0,19,36,23,2,0,103,91,71,54,0,6,24,32,29,0,57,58,31,0,0,80,20,92,112,0,5,35,
	0,84,0,67,4,72,97,0,7,56,0,25,0,115,10,0,27,0,98,11,0,12,0,16,38,0,93,0,68,34,63,42,0,78,
	61,0,50,0,21,52,0,102,0,116,73,0,106,0,43,96,0,0,0,40,82,53,77,0,9,87,0,17,0,109,26,30,44,
	0,49,75,0,108,0,8,88,0,86,0,39,18,0,79,0,3,55,0,111,0,74,62,0,46,0,89,41,0,118,0,104,69,0,
	47,64,0,83,0,48,0,0,51,0,81,0,0,28,0,107,0,0,65,0,101,0,0,100,0,113,0,99,14,0,60,0,0,117,
	0,114,0,0,22,0,0,0,85,95,0,0,0,0,37,0,90,0,0,105,0,13,0,0,66,0,0,0,0,70,0,0,0,0,110,0,0,0,
	0,76,0,0,0,0,33,0,45
};


static
unsigned atomic_symbol_to_hash( const char * symbol )
{
	// Hash a NUL-terminated atomic symbol for use as an index into hash_to_element[].
	//
	// Returns a (potentially valid) hash value in 1..214 else a (definitely invalid) 0.
	// A non-zero result only nominates a candidate element; the caller must
	// still compare the symbol text against that candidate.
	//
	// Each character contributes 1 + xs[ slot ]. Characters with codes 64..127
	// (which covers the ASCII letters) use slot (code & 63); every other
	// character uses slot 0. Slots that hold 215 push the sum past the valid
	// range, so a symbol containing such a character is always rejected.
	//
	// The table is read-only data, hence `static const`.
	static const unsigned char xs[] =
	{
		215, 110,  30,  15, 100,  59,  90,  11,   2,  87, 215,   5,   9,  30,  40, 105,
		  0, 215,  95,  55,  65,  27,   7, 120,   7, 110,  84, 215, 215, 215, 215, 215,
		215,  10,  85,  14, 122,   5,  34,  22, 117, 105, 215,   7,  77,  65,  12,  32,
		 17, 215,   0, 100,  64,   2,  65, 215, 215,  90, 215, 215, 215, 215, 215, 215,
	};
	unsigned hash = 0;
	for (const unsigned char * p = (const unsigned char *) symbol;  *p;  p++)
	{
		unsigned x = *p;
		hash += 1 + xs[ ((x & 0xC0) == 0x40) ? (x & 0x3F) : 0 ];
	}
	return hash < 215 ? hash : 0;
}


unsigned atomic_symbol_to_number( const char * symbol )
{
	// Look up the atomic number for a NUL-terminated atomic symbol.
	// Returns the atomic number, or 0 when the symbol is not in the table.

	// A zero hash can never belong to an element
	unsigned h = atomic_symbol_to_hash( symbol );
	if (h == 0) return 0;

	// The hash only nominates a candidate; confirm it by comparing the text
	unsigned candidate = hash_to_element[ h ];
	if (strcmp( symbol, Elements[ candidate ].symbol ) != 0) return 0;

	return candidate;
}


const struct Element * atomic_symbol_to_element( const char * symbol )
{
	// Look up the Element record for a NUL-terminated atomic symbol.
	// Returns a pointer into Elements[], or NULL when the symbol is unknown.

	// The number lookup performs the hash probe and the text comparison.
	// It yields 0 exactly when there is no match, and a match is never
	// element 0 (only "" equals that placeholder, and "" hashes to 0).
	unsigned number = atomic_symbol_to_number( symbol );
	if (number == 0) return NULL;

	return &Elements[ number ];
}

Continued...

Duthomhas (13148)

Next we can focus on our input parser to produce a list of (atom number, count) pairs. We can make a little class like this:

atom-parser.h

#ifndef ATOM_PARSER_H
#define ATOM_PARSER_H

#include <stdbool.h>


// The language we will parse is:
//
//   L      ::= WS? ATOMS (WS '+' WS ATOMS)* WS?  // example: "O2 + H2O"
//   ATOMS  ::= ATOM+                             // examples: "H2O2"
//   ATOM   ::= (SYMBOL | GROUP) COUNT? ION?      // examples: "O2", "(OH)4"
//   SYMBOL ::= UPPER LOWER?                      // atomic symbol such as "K" or "Fe"
//   GROUP  ::= '(' ATOMS ')' | '[' ATOMS ']'     // stuff in parentheses
//   COUNT  ::= ('1'..'9') ('0'..'9')*            // unsigned integer number (just not "0")
//   ION    ::= '+' | '-'                         // ion charge
//   WS     ::= SP | FF | LF | CR | HT | VT       // whitespace
//
// Thus we can read things like:
//
//   (CH2O)6         glucose (sugar)
//   Al(OH)3         aluminum hydroxide
//   [Co(NH3)6]3+    complex cation
//   [CoCl4(NH3)2]-  complex anion
//   CH3OCH3 + H2O   dimethyl ether plus water
//   [Fe(CN)6]4-     potassium ferrocyanide coordination complex
//
// All our parsing does is collect a list of (atom, count) pairs, possibly with duplicates.
// For example, "CH3OCH3 + H2O" (dimethyl ether plus water) produces:
//
//   C1 H3 O1 C1 H3 H2 O1
//
// (Which can be later combined to get something like: H8 C2 O2.)
//
// Notice that we CANNOT handle abbreviations like "(en)" (ethylenediamine, or NH2CH2CH2NH2).
// It would be very easy to add that to the language and parser, though!


//-------------------------------------------------------------------------------------------------
struct atom
//-------------------------------------------------------------------------------------------------
// A handy linked list of the (element, count) pairs we are collecting:
// Nodes are allocated by make_atom() and released by free_atoms().
{
	unsigned      number;  // Atomic element number ("H" --> 1, "He" --> 2, ...)
	unsigned      count;   // Number of atoms of this element
	struct atom * next;    // Next node in the list, or NULL at the end of the list
};
typedef struct atom atom;

// Allocate a new list node holding (number, count) and link it in front of `next`.
// Returns the new node, or NULL if the allocation failed.
atom * make_atom( unsigned number, unsigned count, atom * next );
// Free every node in the list (a NULL list is fine).
// Always returns NULL, so a caller can write `list = free_atoms( list );`.
atom * free_atoms( atom * );


//-------------------------------------------------------------------------------------------------
struct atom_parser
//-------------------------------------------------------------------------------------------------
// This little object collects atoms from the input string.
// `s` advances as input is consumed; after a failed parse it marks where the
// problem was found. The parser never modifies or frees the source string.
{
	const char * s;      // source string
	atom *       atoms;  // list of atoms parsed from string
	const char * error;  // error message else NULL
};
typedef struct atom_parser atom_parser;

// Create a parser positioned at the start of `s`, with no atoms and no error.
atom_parser make_atom_parser( const char * s );
bool parse_atoms( atom_parser * );
// Returns true if `s` was successfully parsed.
// Otherwise check the `parser.error` message and the position of `parser.s`.
// Don't forget to `free_atoms( parser.atoms )`.
// (Atoms collected before a failure remain in the list, so free it in both cases.)


#endif

Last edited on

Duthomhas (13148)

atom-parser.c

#include <ctype.h>
#include <iso646.h>
#include <limits.h>
#include <stdlib.h>

#include "elements.h"
#include "atom-parser.h"


// Notice how very like a C++ class this design is:
//   Public functions are prototyped in the header file.
//   Private functions are marked local to this file with 'static'.
//
// Each parse_X() function implements a piece of the BNF language described
// in the header file.
//
// The parse functions return whether something was successfully parsed, or
// false on error.


atom * make_atom( unsigned number, unsigned count, atom * next )
{
	// Allocate one list node and link it in front of `next`.
	// Returns the new node, or NULL when the allocation fails.
	atom * node = malloc( sizeof *node );
	if (node == NULL) return NULL;

	node->number = number;
	node->count  = count;
	node->next   = next;
	return node;
}


atom * free_atoms( atom * atoms )
{
	// Release every node of the list; a NULL list is accepted.
	// Always returns NULL so a caller can reset its pointer in one statement.
	for (atom * following;  atoms;  atoms = following)
	{
		// Read the link before the node that holds it is freed
		following = atoms->next;
		free( atoms );
	}
	return NULL;
}


atom_parser make_atom_parser( const char * s )
{
	atom_parser parser = { .s=s, .atoms=NULL, .error=NULL };
	return parser;
}


static bool error( atom_parser * parser, const char * message )
{
	// Record why parsing failed. The result is always false so that a parse
	// function can report and fail in one statement: `return error( ... );`.
	// The message pointer is stored as-is (not copied); any previously
	// recorded message is replaced.
	parser->error = message;
	return false;
}


static bool add_atom( atom_parser * parser, unsigned atomic_number, unsigned count )
{
	// Push a new (element, count) node onto the front of the parser's list.
	// Returns true on success; on allocation failure records an error and
	// returns false, leaving the list as it was.
	//
	// Order does not matter for our purposes, so the list is built head-first.
	// If order mattered we could track the tail, or reverse the list afterwards.
	atom * head = make_atom( atomic_number, count, parser->atoms );
	if (head)
	{
		parser->atoms = head;
		return true;
	}
	return error( parser, "memory allocation failure" );
}


static bool skip_whitespace( atom_parser * parser )
{
	// WS: advance past any run of whitespace characters.
	// Returns true if at least one character was skipped.
	const char * start = parser->s;
	while (*parser->s and isspace( (unsigned char) *parser->s ))
		parser->s++;
	return parser->s != start;
}


static bool peek_char( atom_parser * parser, char c )
{
	// Is `c` the next unread character? Nothing is consumed.
	return parser->s[ 0 ] == c;
}


static bool parse_char( atom_parser * parser, char c )
{
	// Consume the next character if (and only if) it is `c`.
	// Returns whether a character was consumed.
	if (!peek_char( parser, c )) return false;
	parser->s++;
	return true;
}


static unsigned parse_COUNT_ION( atom_parser * parser )
{
	// Parse the optional COUNT and the optional ION mark that may follow a
	// SYMBOL or a GROUP.
	//
	// Returns the count, which is 1 when no digits are present.
	// On failure the error message is recorded, `parser->s` is rewound to the
	// first digit of the count, and 0 is returned. A valid count is never 0,
	// so callers must check `parser->error` before using the result.
	//
	// NOTE: leading zeros ("H02") are tolerated, although the grammar in the
	// header says that a COUNT starts with '1'..'9'.

	// COUNT ::= ('1'..'9') ('0'..'9')*
	const char * error_s = parser->s;
	bool has_digits = false;
	unsigned count = 0;

	while (isdigit( (unsigned char) *parser->s ))
	{
		unsigned digit = (unsigned) (*parser->s - '0');

		// Refuse to wrap around: a wrapped count would be silently wrong
		if (count > (UINT_MAX - digit) / 10)
		{
			parser->s = error_s;
			error( parser, "atom count is too large" );
			return 0;
		}

		count = count * 10 + digit;
		has_digits = true;
		parser->s++;
	}

	if (has_digits and !count)
	{
		parser->s = error_s;
		error( parser, "zero is not a valid atom count" );
		return 0;
	}

	// ION ::= '+' | '-'
	// The charge mark is accepted and discarded; at most one is consumed.
	if (!parse_char( parser, '+' )) parse_char( parser, '-' );

	return has_digits ? count : 1;
}


static bool parse_SYMBOL_COUNT_ION( atom_parser * parser )
{
	// Parse one atomic symbol with its optional COUNT and ION, and add the
	// resulting (element, count) pair to the parser's list.
	//
	// Returns true when an atom was added. Returns false when the input does
	// not start with an uppercase letter (no error is recorded, nothing is
	// consumed) or when an error was recorded: unknown symbol, bad count, or
	// allocation failure.

	// SYMBOL ::= UPPER LOWER?
	char symbol_name[ 3 ];
	const char * parser_s = parser->s;

	// An atomic symbol is an uppercase letter optionally followed by a lowercase letter
	char * symbol = symbol_name;
	if (isupper( (unsigned char) *parser->s ))
	{
		*symbol++ = *(parser->s)++;
		if (islower( (unsigned char) *parser->s ))
			*symbol++ = *(parser->s)++;
	}
	*symbol = '\0';

	if (!*symbol_name) return false;

	// Convert / Validate
	unsigned number = atomic_symbol_to_number( symbol_name );
	if (!number)
	{
		// Point the error at the start of the offending symbol
		parser->s = parser_s;
		return error( parser, "invalid atomic symbol" );
	}

	// A rejected count records an error; do not add the atom or claim success
	unsigned count = parse_COUNT_ION( parser );
	if (parser->error) return false;

	return add_atom( parser, number, count );
}


static bool parse_ATOMS( atom_parser * );
// ↑ forward declaration for mutual recursion with parse_GROUP_COUNT_ION()
//   (a GROUP contains ATOMS, and an ATOM may itself be a GROUP)


static bool parse_GROUP_COUNT_ION( atom_parser * parser )
{
	// Parse a bracketed group with its optional COUNT and ION, multiplying
	// the count of every atom inside the group by the group's count.
	//
	// Returns true when a complete group was parsed. Returns false when the
	// input does not start with '(' or '[' (no error is recorded, nothing is
	// consumed) or when an error was recorded while parsing the group.

	// GROUP ::= '(' ATOMS ')' | '[' ATOMS ']'

	// Consume the open parenthesis and remember which close must match it
	char close_paren;
	if      (parse_char( parser, '(' )) close_paren = ')';
	else if (parse_char( parser, '[' )) close_paren = ']';
	else return false;

	// New atoms are added before the current head, so the atoms of this group
	// will be exactly the nodes from the new head up to (not including) `end`
	atom * end = parser->atoms;

	// Recurse to get the parenthesized atoms
	if (!parse_ATOMS( parser )) return false;

	// Parentheses must terminate with matching parenthesis
	if (!parse_char( parser, close_paren ))
		return error( parser, close_paren == ')' ? "expected ')'" : "expected ']'" );

	// A rejected count records an error; do not scale the group or claim success
	unsigned count = parse_COUNT_ION( parser );
	if (parser->error) return false;

	// Update the counts of all the grouped atoms
	for (atom * iter = parser->atoms;  iter != end;  iter = iter->next)
		iter->count *= count;

	return true;
}


static bool parse_ATOM( atom_parser * parser )
{
	// ATOM ::= (SYMBOL | GROUP) COUNT? ION?
	//
	// Try a bracketed group first; if that does not succeed, try a plain symbol.
	// Returns true when either alternative parsed an atom.
	if (parse_GROUP_COUNT_ION( parser )) return true;
	return parse_SYMBOL_COUNT_ION( parser );
}


static bool parse_ATOMS( atom_parser * parser )
{
	// ATOMS ::= ATOM+
	//
	// Returns true only when at least one ATOM was parsed and no error was
	// recorded. On failure `parser->error` is always set.

	// One ATOM required
	if (!parse_ATOM( parser ))
		return !parser->error and error( parser, "expected ATOM" );

	// Additional ATOMs are optional
	while (parse_ATOM( parser ))
		;

	// The loop also ends when a later ATOM was malformed. That is a failure,
	// not the end of the list: reporting success here would let the caller
	// keep parsing, replacing the message or moving past the error position.
	return !parser->error;
}


bool parse_atoms( atom_parser * parser )
{
	// This public-facing function parses the top-level L part of our language specification:
	//   L ::= WS? ATOMS (WS '+' WS ATOMS)* WS?
	//
	// Returns true when the whole source string was parsed. Otherwise returns
	// false with `parser->error` set and `parser->s` left at the position
	// where the problem was detected. Atoms collected so far stay in
	// `parser->atoms` either way.

	// Optional leading whitespace
	skip_whitespace( parser );

	// Nothing parsed or error?
	// Stop at the first error so that `parser->s` keeps pointing at it.
	if (!parse_ATOMS( parser ) or parser->error) return false;

	// While (additional terms)
	while (skip_whitespace( parser ) and parse_char( parser, '+' ))
	{
		// The '+' that joins terms must be surrounded by whitespace
		if (!skip_whitespace( parser ))
			return error( parser, "expected whitespace after '+'" );

		// An additional term must exist (unless there was an error)
		if (!parse_ATOMS( parser ))
			return !parser->error and error( parser, "unexpected end of input" );

		// An error inside the term ends parsing here, at the error position
		if (parser->error) return false;
	}

	// Expect end of input
	switch (*parser->s)
	{
		case  0 : break;
		case ')': return error( parser, "unexpected ')'" );
		case ']': return error( parser, "unexpected ']'" );
		default:  return error( parser, "expected end of input or \" + \"" );
	}

	return true;
}

Last edited on

Duthomhas (13148)

Then we can pull it all together in:

main.c

#include <iso646.h>
#include <stdio.h>
#include <string.h>

#include "elements.h"
#include "atom-parser.h"


int print_error( const char * s, unsigned n, const char * message )
{
	// Report a parse error on stderr as two lines: the offending text, then a
	// caret under character offset `n` of `s` followed by the message.
	// Always returns 1, so main() can write `return print_error( ... );`.
	fprintf( stderr, "ERROR: %s\n", s );

	// "ERROR: " is 7 characters wide, so the caret belongs in column n+7.
	// A `*` field width must be passed as an int, hence the cast.
	fprintf( stderr, "%*s^ %s\n", (int) (n + 7), " ", message );
	return 1;
}


int main( int argc, char ** argv )
{
	// A chemical formula to parse
	char s[ 100 ] = "";
	{
		// Provided by user at the command-line
		if (argc > 1)
		{
			for (int n = 1;  n < argc;  n++)
			{
				strcat( s, " " );
				strcat( s, argv[n] );
			}
		}
		// Else ask user for it
		else
		{
			printf( "formula? " );
			fflush( stdout );
			fgets( s, sizeof(s), stdin );
			char * p = strchr( s, '\n' );
			if (p) *p = '\0';
		}
	}

	// Convert the input string into a list of (atom,count) pairs
	atom * atoms = NULL;
	{
		atom_parser parser = make_atom_parser( s );
		if (!parse_atoms( &parser ) and parser.error)
		{
			free_atoms( parser.atoms );
			return print_error( s, parser.s - s, parser.error );
		}
		atoms = parser.atoms;
	}

	// Tally the counts for each element
	unsigned counts[ 1+118 ] = { 0 };  // 1..118 elements
	for (atom * iter = atoms;  iter;  iter = iter->next)
	{
		counts[ iter->number ] += iter->count;
	}
	
	// Done with these
	free_atoms( atoms );

	// Pretty print the results (in atomic order)
	for (int n = 1, k = 0;  n < 1+118;  n++)
		if (counts[n])
		{
			if (k++) printf( " + " );
			printf( "%s%u", Elements[n].symbol, counts[n] );
		}
	puts( "" );
}

Sure, you can write this much smaller. I initially did just that. But my small-n-sweet version was significantly less capable and failed to handle errors properly.

The formulas listed in the header:

formula? (CH2O)6
H12 + C6 + O6

formula? Al(OH)3
H3 + O3 + Al1

formula? [Co(NH3)6]3+
H54 + N18 + Co3

formula? [CoCl4(NH3)2]-
H6 + N2 + Cl4 + Co1

formula? CH3OCH3 + H2O
H8 + C2 + O2

formula? [Fe(CN)6]4-
C24 + N24 + Fe4

formula? Og+O
O1 + Og1

formula? Og-O
O1 + Og1

Errors:

formula? 
ERROR: 
       ^ expected ATOM

formula? 2
ERROR: 2
       ^ expected ATOM

formula? Oz
ERROR: Oz
       ^ invalid atomic symbol

formula? (Og
ERROR: (Og
          ^ expected ')'

formula? Og)
ERROR: Og)
         ^ unexpected ')'

formula? Og+ O
ERROR: Og+ O
           ^ expected end of input or " + "

formula? Og +O
ERROR: Og +O
           ^ expected whitespace after '+'

formula? Og++O
ERROR: Og++O
          ^ expected end of input or " + "

formula? [H2O)
ERROR: [H2O)
           ^ expected ']'

Meh.

I at least had fun playing with it.

Last edited on

Registered users can post here. Sign in or register to post.