Source code for gwgen.parse_eecra

# -*- coding: utf-8 -*-
import re
import os.path as osp
import numpy as np
import pandas as pd
from collections import OrderedDict
from gwgen.utils import file_len
from gwgen._parseeecra import parseeecra

# Column labels for the parsed EECRA records. `parse_file` zips this list, in
# order, with the sequence returned by ``parseeecra.parse_file`` to build the
# DataFrame, so the order here must not be changed.
names = [
    'year', 'month', 'day', 'hour',
    'IB', 'lat', 'lon', 'station_id',
    'LO', 'ww', 'N', 'Nh', 'h',
    'CL', 'CM', 'CH',
    'AM', 'AH', 'UM', 'UH',
    'IC', 'SA', 'RI', 'SLP',
    'WS', 'WD', 'AT', 'DD',
    'EL', 'IW', 'IP',
]


def parse_file(ifile, year=None):
    """Parse a raw data file from EECRA and return it as a pandas DataFrame

    Parameters
    ----------
    ifile: str
        The raw (uncompressed) data file
    year: int
        The first year in the data file. If None, it is inferred from the
        base name of `ifile`, which then has to start with three word
        characters followed by the two-digit year and an ``'L'``. Two-digit
        years above 60 are interpreted as 19xx, all others as 20xx.

    Returns
    -------
    pandas.DataFrame
        `ifile` parsed into a dataframe whose columns are labelled, in
        order, with the module level `names`

    Raises
    ------
    TypeError
        If `year` is None and the year cannot be inferred from the name of
        `ifile`"""
    if year is None:
        m = re.match(r'\w{3}(\d{2})L', osp.basename(ifile))
        if m is None:
            raise TypeError(
                "Could not infer year of file %s! Use the 'year' "
                "parameter!" % (ifile, ))
        short_year = int(m.group(1))
        # expand the two-digit year from the file name to four digits
        year = short_year + (1900 if short_year > 60 else 2000)
    columns = parseeecra.parse_file(ifile, year, file_len(ifile))
    # OrderedDict keeps the column order of `names` in the resulting frame
    return pd.DataFrame.from_dict(OrderedDict(zip(names, columns)))


def extract_data(ids, src_dir, target_dir, years=range(1971, 2010),
                 imonths=range(1, 13)):
    """Extract the data for the given EECRA stations

    This function extracts the data for the given `ids` from the EECRA data
    base stored in `src_dir` into one file for each *id* in `ids`. The
    resulting filename will be like *id.csv*.

    Parameters
    ----------
    ids: np.ndarray of dtype int
        The numpy integer array with the station ids to extract. A single
        (scalar) id is accepted as well
    src_dir: str
        The path to the source directory containing the raw (uncompressed)
        EECRA database
    target_dir: str
        The path to the output directory
    years: np.ndarray of dtype int
        The numpy integer array with the years to extract (by default, all
        years from 1971 to 2009)
    imonths: np.ndarray of dtype int
        The numpy integer array with the months to extract (by default, all
        from january to december)

    Returns
    -------
    numpy.ndarray
        The paths of the filenames in `target_dir` corresponding to ids"""
    # ``np.atleast_1d`` promotes scalar input to a one-element array. The
    # previous ``arr.reshape((1,))`` discarded its result, so 0-d arrays were
    # passed on unchanged and could not be iterated over below.
    ids = np.atleast_1d(np.asarray(ids).astype(int))
    years = np.atleast_1d(np.asarray(years).astype(int))
    imonths = np.atleast_1d(np.asarray(imonths).astype(int))
    # joining with '' appends a trailing path separator to the directories
    parseeecra.extract_data(
        ids, osp.join(src_dir, ''), osp.join(target_dir, ''), years, imonths)
    # NOTE(review): the paths were previously built from `src_dir`, although
    # the documented output directory is `target_dir` -- confirm that no
    # caller relied on the old return value.
    return np.array([osp.join(target_dir, str(station_id) + '.csv')
                     for station_id in ids])
Source code for gwgen.parse_eecra

gwgen

Navigation

Related Topics