Source code for gwgen.parse_eecra

# -*- coding: utf-8 -*-
import re
import os.path as osp
import numpy as np
import pandas as pd
from collections import OrderedDict
from gwgen.utils import file_len
from gwgen._parseeecra import parseeecra

# Column labels for the DataFrame built in ``parse_file``. They are paired
# positionally (via ``zip``) with the columns returned by
# ``parseeecra.parse_file``, so the order here must not be changed.
# NOTE(review): the short codes presumably follow the EECRA record layout --
# confirm against the dataset documentation.
names = [
    'year', 'month', 'day', 'hour',
    'IB', 'lat', 'lon', 'station_id', 'LO',
    'ww', 'N', 'Nh', 'h',
    'CL', 'CM', 'CH',
    'AM', 'AH', 'UM', 'UH',
    'IC', 'SA', 'RI',
    'SLP', 'WS', 'WD', 'AT', 'DD',
    'EL', 'IW', 'IP',
]


def parse_file(ifile, year=None):
    """Parse a raw data file from EECRA and return it as a pandas DataFrame

    Parameters
    ----------
    ifile: str
        The raw (uncompressed) data file
    year: int
        The first year in the data file. If None, it is inferred from the
        basename of `ifile`, which then has to start with three word
        characters, a two-digit year and an ``L``. Two-digit years above 60
        are mapped to 19xx, all others to 20xx

    Returns
    -------
    pandas.DataFrame
        `ifile` parsed into a dataframe whose columns are labelled with the
        module level `names`

    Raises
    ------
    TypeError
        If `year` is None and cannot be inferred from the name of `ifile`"""
    if year is None:
        m = re.match(r'\w{3}(\d{2})L', osp.basename(ifile))
        if not m:
            raise TypeError(
                "Could not infer year of file %s! Use the 'year' "
                "parameter!" % (ifile, ))
        year = int(m.group(1))
        # expand the two-digit year found in the file name to four digits
        year += 1900 if year > 60 else 2000
    # the compiled parser returns one item per column; zip pairs them with
    # the column labels in order
    df = pd.DataFrame.from_dict(OrderedDict(
        zip(names, parseeecra.parse_file(ifile, year, file_len(ifile)))))
    return df
def extract_data(ids, src_dir, target_dir, years=range(1971, 2010),
                 imonths=range(1, 13)):
    """Extract the data for the given EECRA stations

    This function extracts the data for the given `ids` from the EECRA
    data base stored in `src_dir` into one file for each *id* in `ids`.
    The resulting filename will be like *id.csv*.

    Parameters
    ----------
    ids: np.ndarray of dtype int
        The numpy integer array with the station ids to extract. A single
        id (scalar) is accepted as well
    src_dir: str
        The path to the source directory containing the raw (uncompressed)
        EECRA database
    target_dir: str
        The path to the output directory
    years: np.ndarray of dtype int
        The numpy integer array with the years to extract (by default,
        ``range(1971, 2010)``, i.e. all years from 1971 to 2009)
    imonths: np.ndarray of dtype int
        The numpy integer array with the months to extract (by default,
        all from january to december)

    Returns
    -------
    numpy.ndarray
        The paths of the filenames in `target_dir` corresponding to ids"""
    # np.atleast_1d promotes scalar input to shape (1,) and leaves arrays
    # with one or more dimensions untouched. (ndarray.reshape returns a new
    # array, so calling it without using the result has no effect.)
    ids = np.atleast_1d(np.asarray(ids).astype(int))
    years = np.atleast_1d(np.asarray(years).astype(int))
    imonths = np.atleast_1d(np.asarray(imonths).astype(int))
    # osp.join(path, '') makes sure both directories end with a separator
    # NOTE(review): presumably the compiled routine builds the file paths by
    # plain string concatenation -- confirm in _parseeecra
    parseeecra.extract_data(
        ids, osp.join(src_dir, ''), osp.join(target_dir, ''), years, imonths)
    # the per-station files are written to the output directory, so the
    # returned paths have to point into target_dir
    return np.array([osp.join(target_dir, str(station_id) + '.csv')
                     for station_id in ids])