• <xmp id="om0om">
  • <table id="om0om"><noscript id="om0om"></noscript></table>
  • Data Center / Cloud

    ???? ??? ??: RAPIDS cuDF? ?? ??? ??? ??

    Reading Time: 5 minutes

    ? ???? ???? ??? ??? ?? ???? ?????:

    ????? ?? ??? ???? ??? ?? ??? ??? ??(EDA) ???????? pandas? ??? ?????? ?? ???? ??? ?? ?????? RAPIDS cuDF? ??? ?? ???? ??? ?? ? ????. ??? ???? ?????? ??? ???? ??? ?? ??? ??? ??? ?? ??? ??? ?? ???, RAPIDS? ???? ? ?? ??? ?? ?????.

    RAPIDS cuDF? ???? ?? ??? ??? ?? ‘????’ ??? ??? ?? ??? ?? ??? ?? ? ????. ??? ??? ??? ???? ?????? Apache Spark? Dask? ?? ??? ?? ??? ??? ???? ????.

    ??? ???? ??????

    ? ????? ??? ???? ???? ?? ??(ML) ?? ??? ???? ??? ??? ???? ?? ??? ?? ?????.

    ??? ???? ???? ????. ?????? ?? ??, ?? ?? ??, ?? ?? ?? ? ??? ??? ??? ???? ??? ?????.

    ?????? ??? ??? ????? ? ??? ??? ? ?? ??? ???? ?????. ????? ???? ??? ???? ???? ??? ???? ?? ?? ??? ????? ??? ??? ? ?? ???.

    ??? ?? ???? ML ?? ??? ??? ???? ?? ????, ?? ?? ?? ??? ???????:

    • ?? ??? ??? ?? ?? ?? ??
    • ?? ??? ?? ??
    • ?? ??? ?? ?? ??
    • ??? ??? ?? ?? ???

    ??? ??? ?? ???? ???? ?? ?? ?? ???? ?? ???? ??? ??? ??? ??? ???? ? ?? ??? ??? ???? ???? ???? ?? ??? ????. ??? ???? ??? ?? ?? ??? ??? ??? ?? ???? ???? ??????, ??? ???? ???? ?? ???? ????? ?? ??? ????.

    pandas? ??? ??? ??? ? ?? ???? ???? ??? ??? ?????, ?? ??? ??? ??? ????? ?? ?? ???? ? ??? ?? ?? ????? ??? ????. ?? ?? ??? ?? ?? ??? ?? ?? ??? ?? ??? ??? ??? ??? ???? ?? ??? ??? ? ????. ??? ???? ??? ??? ???? ??? ???? ??? ? ??? ???? ??? ??? ? ????.

    ??? ??? ????? ??? ??? ??? cuDF? ?? ???? ????. ??? ??? API? ???? ?? ?????? ???? ?? 40? ?? ??? ??? ? ???? ?? ??? ?????? ?? ??? ??? ??? ??? ? ????.

    RAPIDS cuDF? ??? ???

    ? ?????? RAPIDS cuDF? ???? ??? ? ???? ??? ?? ?? ??? ? ?? ??? ????? ?? ??? ??? ???? ??? ?? ??? ?? ??? ?????. ? ???? ????? ?? ??? ?? ?? ?? ??? ??? ?? ??? ??? ????, RAPIDS GitHub ??????? ??? ? ????.

    ?? ???? RAPIDS cuDF? 13?? ??? ???????(??? ??? ? ???? ???? ?? ???? ?? ??). ????? ?? ?????? ????? ??? ?????.

    ?? ??? ???? ??? ?? ??? ???? ??? ????. 1??? ??? ??? 5? ??? ??? ? ??? ??? ??? ?? ?? ?? ? ????.

    ??? ??

    Meteonet? 2016??? 2018??? ?? ??? ??? ?? ???? ???? ???? ?? ?? ??? ????, ????? ???? ?? ???? ???? ????. ??? ? 12.5GB???.

    ?? ?? ??

    ? ?????? ? ?? ???? ?? ?? ??? ?? ??? ?? ???? ?? ??? ????? ??? ?????. ???? ?? ??? ??, ??? ?? ?? ??? ??? ? ? ?? ??????.

    ? ???? ????? ???? ??? ??? ??? ???? ??????? ??? ????. ? ??? ?? ??? ????? ????? ????:

    • ??? ???? ??? ?????.
    • ???? ?? ??????.
    • ?? ??? ??? ?????.

    ? ?????? ????? ??? ????? ?????, cuDF? ??? ??? ??? ???? ??? ? ?? ??? ???? ?????.

    1??. ??? ??? ?? ??

    ?? ?? ??? ???? ? ??? ??? ???? ?????:

    # Import the necessary packages
    import cudf
    import cupy as cp
    import pandas as pd

    ???? CSV ???? ?????.

    ## Read in data
    gdf = cudf.read_csv('./SE_data.csv')

    ?? ?? ?? ????? ??, ??, ??? ??? ??? ??? ?????.?

    gdf = gdf.drop(columns=['dd','precip','td','psl'])

    ?? ?? ????? ??? ? ??? ?? ??? ?????. ?? ?? ??/?? ??? ???? ???? ? ?? ??? ?????. ?? ?? ?? 5?? ?? ???? ?? ?? ??? ????? ? ?? ??? ??? ??? ?????.

    # Change the date column to the datetime data type. Look at the DataFrame info
    gdf['date'] = cudf.to_datetime(gdf['date'])
    gdf.head()
    gdf.shape
    number_stalatlonheight_stadateffhut
    0102700345.835.11196.02016-01-01<NA>98.0279.05
    1103300246.095.81350.02016-01-010.099.0278.35
    2103400445.775.69330.02016-01-010.0100.0279.15
    3107200146.205.29260.02016-01-01<NA><NA>276.55
    4108900145.985.33252.02016-01-010.095.0279.55
    ? 1. ??? ??? ?? ?? ?? ???? ?? ??

    ??

    ??? ??? ??(127515796, 8)? 127,515,796?? ?? 8?? ?? ?????. ?? ??? ??? ??? ??? ????? ???? ??? ??? ???? ?? ? ? ???? ??? ??? ? ????.

    ## Investigate the sampling frequency: diff() computes the time difference between
    ## consecutive rows, dt.seconds extracts the seconds component of each difference,
    ## and max() returns the largest such gap in the series (converted to minutes below).
    delta_mins = gdf['date'].diff().dt.seconds.max()/60
    print(f"The dataset collection covers from {gdf['date'].min()} to {gdf['date'].max()} with {delta_mins} minute sampling interval")

    ??? ??? 6? ??? ???? 2016-01-01T00:00:00.000000000 ?? 2018-12-31T23:54:00.000000000,??? ?? ???? ?????. ?? ??? ??? ??? ??? ????? ?????.

    ??? ??? ?? ???? ??? ???? ???? ?? ??? ?????. ?? ??? ??? ?? ???? ??? ?????.

    gdf['year'] = gdf['date'].dt.year
    gdf['month'] = gdf['date'].dt.month
    gdf['day'] = gdf['date'].dt.day
    gdf['hour'] = gdf['date'].dt.hour
    gdf['mins'] = gdf['date'].dt.minute
    gdf.tail()

    ?? ???? ???? ??, ?, ?? ?? ?????. ??? ???? ?? ?? ??? ?? ? ???? ??? ? ????.

    number_stalatlonheight_stadateffhutyearmonthdayhourmins
    1275157918408600143.8115.146672.02018-12-31 23:54:003.785.0276.95201812312354
    1275157928408700144.1454.86155.02018-12-31 23:54:0011.480.0281.05201812312354
    1275157938409400144.2895.131392.02018-12-31 23:54:003.668.0280.05201812312354
    1275157948410700244.0415.493836.02018-12-31 23:54:000.691.0270.85201812312354
    1275157958415000144.3374.905141.02018-12-31 23:54:006.784.0280.45201812312354
    ? 2. ?? ??? ?? ??? ?? ??

    ??? ?? ?? ??? ????? ???? ????? ??????? ??? ???.

    # Use the cupy.logical_and(...) function to select the data from a specific time range.
    import pandas as pd
    start_time = pd.Timestamp('2017-02-01T00')
    end_time = pd.Timestamp('2018-11-01T00')
    station_id = 84086001
    gdf_period = gdf.loc[cp.logical_and(cp.logical_and(gdf['date']>start_time,gdf['date']<end_time),gdf['number_sta']==station_id)]
    gdf_period.shape
    (146039, 13)

    13?? ??? 146,039?? ?? ??? ??? ???? ????? ???????.

    2??. ??? ????

    ?? ??????? ??????? ??? ???? ??? ?????. ???? 6??? ???????, ? ?? ???? ? ?? ??? ????? ???.

    ?? ??? ???? ???? ??? ??? ??? ?? ????? ???. 6??? ???? ??? ??? ???? ???? ??????? ?? ? ??? ?? ??? ???? ?????. ? ??? ???? ?? ??? ???? ?????.

    ## Set "date" as the index. See what that does?
    gdf_period.set_index("date", inplace=True)
    ## Now, resample by daylong intervals and check the max data during the resampled period. 
    ## Use .reset_index() to reset the index instead of date.
    gdf_day_max = gdf_period.resample('D').max().bfill().reset_index()
    gdf_day_max.head()

    ?? ???? ? ??? ??? ? ????. ?? ???? ??? ??? ??? ???? ?????.

    datenumber_stalatlonheight_staffhutyearmonthdayhourmins
    02017-02-018408600143.815.15672.08.198.0283.052017212354
    12017-02-028408600143.815.15672.014.198.0283.852017222354
    22017-02-038408600143.815.15672.010.199.0281.452017232354
    32017-02-048408600143.815.15672.012.599.0284.352017242354
    42017-02-058408600143.815.15672.07.399.0280.752017252354
    ? 3. ?????? ??? ??? ?? ?? ? ??

    3??. ?? ??? ?? ??

    ?? ???? ????? ??? ???? ???? ???????. ??? ?? ???? ???? ??? ???? ???? ???? ?? ????.

    ?? ???? ???? ?? 3? ?? ???? ?????. ???? ? ??? ???? ?????.

    # Specify the rolling window.
    gdf_3d_max = gdf_day_max.set_index('date').rolling('3d',min_periods=1).max()
    gdf_3d_max.reset_index(inplace=True)
    gdf_3d_max.head()

    ?? ???? ???? ??? ???? ???? ?? ??? ?? ??? ???? ??? ? ????.

    datenumber_stalatlonheight_staffhutyearmonthdayhourmins
    02017-02-018408600143.815.15672.08.198.0283.052017212354
    12017-02-028408600143.815.15672.014.198.0283.852017222354
    22017-02-038408600143.815.15672.014.199.0283.852017232354
    32017-02-048408600143.815.15672.014.199.0283.352017242354
    42017-02-058408600143.815.15672.012.599.0283.352017252354
    ? 4. ?? ??? ??? ?? ??? ??? ??? ?? 5?? ??? ???? ?? ??

    ? ?????? ??? ??? ??? ???? ??? ??? ?????. ????? ?? ??? ??? ??????, ? ??? ?? ??? ??? ???? ??? ? ????. ????? ???? ??? ?? ??? ?????.

    ?? ?? ??

    Meteonet ?? ??? ???? ?? ???? ???? ?, RAPIDS 23.02? ???? NVIDIA RTX A6000 GPU?? 13?? ?? ??? ??????(?? 1).

    Bar chart showing speedup results for data analysis performed on pandas and RAPIDS cuDF.
    ?? 1. Pandas? RAPIDS cuDF? ??? ??? ??? ??? ???? ??
    Pandas on CPU (Intel Core i7-7800X CPU)User: 2 min 32 sec
    Sys: 27.3 sec
    Total: 3 min
    RAPIDS cuDF on NVIDIA A6000 GPUsUser: 5.33 sec
    Sys: 8.67 sec
    Total: 14 sec
    ? 5. ?? ????? 12.8?? ?? ?? ??? ???? ?? ??

    ?? ???

    ??? ??? ?? ??? ?? ?? ??? ??? ??? ?? ?????. RAPIDS cuDF? ???? ??? Pandas ???? ?? ??? ? ??? ???? ????? ???? ??? ??? ? ????.

    ??? ???? cuDF? ?? ? ??? ????? GitHub? rapidsai-community/notebooks-contrib? ?????. EDA ???????? cuDF? ?? ????? ???? ??? ??: RAPIDS cuDF? ??? ?? ??? ????? ?????.

    3? 20??? 23??? ?? NVIDIA GTC 2023? ??? ???? ??? ????? ?????.

    ??

    Meiran Peng, David Taube

    ? ???? ??? SDK? ???? ?? ???, ?? ???, ?? ??, ??, ?? ??, ???? NVIDIA ??? ???? ??? ??? ??? ??? ? ????. ?? ??? ???? NVIDIA? ?? ????? ???? ? ??? ??? ??? ?????? ???? ??? ??? ???.

    Discuss (0)
    +1

    Tags

    ?? ???

    人人超碰97caoporen国产