view db/insertwunderground.c @ 7:9a14029b3782

- Wait longer (100msec) for a read, and reduce the number of retries to match. - Print out if we read OK after more than 1 attempt for diagnostic purposes.
author Daniel O'Connor <darius@dons.net.au>
date Thu, 11 Feb 2010 12:47:59 +1030
parents 9dab44dcb331
children
line wrap: on
line source

/*
 * Read in Wunderground comma-delimited files and convert them into INSERT
 * commands for MySQL.
 *
 * $Id: insertwunderground.c,v 1.5 2009/12/09 03:27:55 grog Exp $
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>


#define BUFSIZE 1024
char attribute_list [BUFSIZE];
char buf [BUFSIZE];
char sql_command [BUFSIZE];
int attribute_count;
char *station;                                  /* name of station */
char *date;                                     /* and date from second parameter, if present */

/*
 * Convert the Wunderground field names to our table.  This is confused by the
 * fact that Wunderground can't make up its mind what to call the fields, so
 * multiple names are possible, and they can have spaces in them.  If we don't
 * care about the field, we have a null translation.
 *
 * Some fields are problems: time is really two fields, date and time, and
 * precipitation is sometimes in cm rather than mm.  We special-case them.
 */
struct attributes
{
  char *wundername;                             /* name supplied in file  */
  char *column_name;                            /* and the names we use */
} attributes [] = {
   {"Clouds", "clouds"},
   {"Conditions", "conditions"},
   {"Dew PointC", "outside_dewpoint"},
   {"DewpointC", "outside_dewpoint"},
   {"Events", "events"},
   {"Gust SpeedKm/h", "wind_gust"},
   {"HourlyPrecipMM", "rain"},
   {"Humidity", "outside_humidity"},
   {"PrecipitationCm", ""},
   {"PressurehPa", "pressure_msl"},       /* I assume this is correct */
   {"Sea Level PressurehPa", "pressure_msl"},
   {"SoftwareType", ""},
   {"TemperatureC", "outside_temp"},
   {"Time", "date"},
   {"TimeEST", "time"},
   {"VisibilityKm", "visibility"},
   {"Wind Direction", "wind_direction_text"},
   {"Wind SpeedKm/h", "wind_speed"},
   {"WindDirection", "wind_direction_text"},
   {"WindDirectionDegrees", "wind_direction"},
   {"WindSpeedGustKMH", "wind_gust"},
   {"WindSpeedKMH", "wind_speed"},
   {"dailyrainCM", "" }};

int columns = sizeof (attributes) / sizeof (struct attributes);

int field  [50];                                /* build up a table of the fields here */

void skipblanks (char **cp)
{
  while (**cp == ' ')
    (*cp)++;
}

int main (int argc, char *argv [])
{
  char *cp;                                     /* pointers in buffers */
  char *ep;                                     /* pointers in buffers */
  char *fp;
  char fieldname [64];
  int fields = 0;                               /* becomes number of fields in table */
  int myfield;
  int i;

  if (argc < 2)
  {
    fprintf (stderr, "Usage: $0 < file STATION_ID\n");
    exit (1);
  }
  station = argv [1];                           /* arg is station name */
  if (argc > 2)                                 /* date specified too */
    date = argv [2];

  /*
   * The first useful line should be a list of field names.  Read it in and
   * decide what we want to use.
   */
  do
  {
    if (! fgets (attribute_list, BUFSIZE - 1, stdin))
      exit (0);                                 /* empty file, just ignore  */
  }
  while (strlen (attribute_list) < 4);          /* I'm not sure how long the
                                                 * initial junk is, but this
                                                 * should cover it */
  /*
   * Untidy copy of field names.
   */
  cp = attribute_list;
  while (1)
  {
    fp = fieldname;
    skipblanks (&cp);
    while ((*cp != ',') && (*cp !='0') && (*cp !='<'))
      *fp++ = *cp++;                            /* copy the character */
    *fp = '\0';                                 /* and delimit it */
    for (i = 0; i < columns; i++)
      if (! strcmp (fieldname, attributes [i].wundername)) /* found it */
      {
        field  [fields] = i;
        break;
      }
    if (i == columns)                           /* not found */
    {
      fprintf (stderr, "Warning: can't find field name %s\n", fieldname);
      field  [fields] = -1;                     /* don't use this field */
    }
    if (strlen (attributes [i].column_name) == 0) /* no column */
      field [fields] = -1;                      /*  */
    if ((*cp == '<') || (*cp == '\0'))          /* end of useful info */
      break;
    cp++;                                       /* skip the delimiter */
    fields++;                                   /* done another field */
  }
  /*
   * On exit from the loop, fields contains the number of fields, and field []
   * is a translation table.
   */
  while (1)
  {
    if (! fgets (buf, BUFSIZE - 1, stdin))
      exit (0);                                 /* empty file, just ignore  */
    if (memcmp (buf, "20", 2) == 0)             /* we have a line starting with at date  */
    {
      char *time;

      /* walk through the date field and convert it to two fields */
      sprintf (sql_command, "REPLACE INTO remote_observations (station_id, date, time");
      /* first get the names */
      for (myfield = 1; myfield < fields; myfield++)
      {
        if (field [myfield] >= 0)
          sprintf (&sql_command [strlen (sql_command)], ", %s", attributes [field [myfield]].column_name);
      }
      sprintf (&sql_command [strlen (sql_command)], ") VALUES (");
      /* and now the values */
      cp = buf;
      while (*cp != ' ')
        cp++;
      *cp++ = '\0';                             /* split first field */
      skipblanks (&cp);                         /* this shouldn't be necessary */
      time = cp;                                /* second is time */
      while (*cp != ',')
        cp++;                                   /* delimit */
      *cp++ = '\0';                             /* delimit second field */
      /*
       * We now have buf pointing to the date, time pointing to the time, both
       * \0 delimited, and cp pointing to the rest of the buffer.
       */
      sprintf (&sql_command [strlen (sql_command)],
               "\"%s\", \"%s\", \"%s\"",
               station,
               buf,
               time );
      /*
       * Do the rest
       */
      for (myfield = 1; myfield < fields; myfield++)
      {
        skipblanks (&cp);
        ep = cp;
        while ((*ep != ',') && (*ep != '\n'))
          ep++;
        *ep++ = '\0';                           /* make a string of it */
        if (field [myfield] >= 0)
          sprintf (&sql_command [strlen (sql_command)], ", \"%s\"", cp);
        cp = ep;                                /* point to next field */
        /*
         * Strictly speaking this isn't necessary, since we have a counted
         * number of fields, but I don't trust the data.
         */
        if (*cp == '\0')
          break;
      }
      sprintf (&sql_command [strlen (sql_command)], ");");
      puts (sql_command);
    }
    else if (isdigit (buf [0]))                 /* must be a time  */
    {
      sprintf (sql_command, "REPLACE INTO remote_observations (station_id, date");


      /* first get the names */
      for (myfield = 0; myfield < fields; myfield++)
      {
        if (field [myfield] >= 0)
          sprintf (&sql_command [strlen (sql_command)], ", %s", attributes [field [myfield]].column_name);
      }
      sprintf (&sql_command [strlen (sql_command)], ") VALUES (\"%s\", \"%s\"",
               station,
               date );
      /*
       * Do the rest
       */
      cp = buf;
      for (myfield = 0; myfield < fields; myfield++)
      {
        skipblanks (&cp);
        ep = cp;
        while ((*ep != ',') && (*ep != '\n'))
          ep++;
        *ep++ = '\0';                           /* make a string of it */
        if (field [myfield] >= 0)
          sprintf (&sql_command [strlen (sql_command)], ", \"%s\"", cp);
        cp = ep;                                /* point to next field */
        /*
         * Strictly speaking this isn't necessary, since we have a counted
         * number of fields, but I don't trust the data.
         */
        if (*cp == '\0')
          break;
      }
      sprintf (&sql_command [strlen (sql_command)], ");");
      puts (sql_command);
    }
  }
  return 0;
}