pythonaggregategroupingdata-analysisexploratory-data-analysis

I want to find the maximum number for each year of each country, but i'm stuck


I am diving into a data set, I have cleaned and now I want to display the year each country had the maximum subscribers but I'm stuck

Tried this:

netf.query('country != "unspecified"')\
    .groupby(['country','created_year'], sort=True)['subscribers'].sum()

and got this:

country               created_year
Andorra               2006.0            15100000
Argentina             2006.0            23000000
                      2011.0            58400000
                      2013.0            71300000
                      2014.0            40000000
                      2015.0            77700000
                      2016.0            37500000
                      2017.0            20600000
Australia             2009.0            18000000
                      2011.0            42000000
                      2013.0            42600000
                      2014.0            17200000
                      2015.0            34600000
Bangladesh            2021.0            13900000
Barbados              2005.0            41900000
Brazil                2006.0            97000000
                      2007.0            18500000
.....

but I'm now stuck and don't know how to proceed.

P.s. I'm new to python programming

here's the top 20 rows: netf.head(20).to_dict('list'):

{'youtuber': ['T-Series',
  'YouTube Movies',
  'MrBeast',
  'Cocomelon - Nursery Rhymes',
  'SET India',
  'ýýý Kids Diana Show',
  'PewDiePie',
  'Like Nastya',
  'Vlad and Niki',
  'Zee Music Company',
  'WWE',
  'BLACKPINK',
  'Goldmines',
  'Sony SAB',
  '5-Minute Crafts',
  'BANGTANTV',
  'Sports',
  'Justin Bieber',
  'HYBE LABELS',
  'Zee TV'],
 'subscribers': [245000000,
  170000000,
  166000000,
  162000000,
  159000000,
  112000000,
  111000000,
  106000000,
  98900000,
  96700000,
  96000000,
  89800000,
  86900000,
  83000000,
  80100000,
  75600000,
  75000000,
  71600000,
  71300000,
  70500000],
 'views': [228000000000,
  0,
  28368841870,
  164000000000,
  148000000000,
  93247040539,
  29058044447,
  90479060027,
  77180169894,
  57856289381,
  77428473662,
  32144597566,
  24118230580,
  101000000000,
  26236790209,
  20826993957,
  0,
  30608119724,
  28634566938,
  73139054467],
 'category': ['Music',
  'Film & Animation',
  'Entertainment',
  'Education',
  'Shows',
  'People & Blogs',
  'Gaming',
  'People & Blogs',
  'Entertainment',
  'Music',
  'Sports',
  'People & Blogs',
  'Film & Animation',
  'Shows',
  'Howto & Style',
  'Music',
  'unknown',
  'Music',
  'Music',
  'Entertainment'],
 'title': ['T-Series',
  'youtubemovies',
  'MrBeast',
  'Cocomelon - Nursery Rhymes',
  'SET India',
  'ýýý Kids Diana Show',
  'PewDiePie',
  'Like Nastya Vlog',
  'Vlad and Niki',
  'Zee Music Company',
  'WWE',
  'BLACKPINK',
  'goldmines',
  'Sony SAB',
  '5-Minute Crafts 2.0',
  'BANGTANTV',
  'sports',
  'Justin Bieber',
  'HYBE LABELS',
  'Zee TV'],
 'uploads': [20082,
  1,
  741,
  966,
  116536,
  1111,
  4716,
  493,
  574,
  8548,
  70127,
  543,
  1,
  71270,
  1,
  2281,
  3,
  249,
  1337,
  129204],
 'country': ['India',
  'United States',
  'United States',
  'United States',
  'India',
  'United States',
  'Japan',
  'Russia',
  'United States',
  'India',
  'United States',
  'South Korea',
  'unspecified',
  'India',
  'United Kingdom',
  'South Korea',
  'United States',
  'Canada',
  'South Korea',
  'India'],
 'channel_type': ['Music',
  'Games',
  'Entertainment',
  'Education',
  'Entertainment',
  'Entertainment',
  'Entertainment',
  'People',
  'Entertainment',
  'Music',
  'Sports',
  'Music',
  'Music',
  'Entertainment',
  'Entertainment',
  'Music',
  'Entertainment',
  'Music',
  'Music',
  'Entertainment'],
 'video_views_rank': ['1.0',
  '4055159.0',
  '48.0',
  '2.0',
  '3.0',
  '5.0',
  '44.0',
  '630.0',
  '8.0',
  '12.0',
  '7.0',
  '32.0',
  '4056562.0',
  '4.0',
  '4057901.0',
  '112.0',
  '3898122.0',
  '38.0',
  '46.0',
  '9.0'],
 'country_rank': ['1.0',
  '7670.0',
  '1.0',
  '2.0',
  '2.0',
  '3.0',
  '1.0',
  '5.0',
  '5.0',
  '3.0',
  '6.0',
  '1.0',
  '0.0',
  '5.0',
  '4797.0',
  '2.0',
  '6266.0',
  '1.0',
  '3.0',
  '6.0'],
 'channel_type_rank': ['1.0',
  '7423.0',
  '1.0',
  '1.0',
  '2.0',
  '3.0',
  '4.0',
  '25.0',
  '6.0',
  '2.0',
  '1.0',
  '3.0',
  '5663.0',
  '7.0',
  '6781.0',
  '4.0',
  '5395.0',
  '6.0',
  '5.0',
  '8.0'],
 'last_monthly_views': [2258000000,
  12,
  1348000000,
  1975000000,
  1824000000,
  731674000,
  39184000,
  48947000,
  580574000,
  803613000,
  714614000,
  498930000,
  18,
  1657000000,
  1,
  168290000,
  16,
  176326000,
  598173000,
  1707000000],
 'lowest_monthly_earnings': [564600.0,
  0.0,
  337000.0,
  493800.0,
  455900.0,
  182900.0,
  9800.0,
  12200.0,
  145100.0,
  200900.0,
  178700.0,
  124700.0,
  0.0,
  414300.0,
  0.0,
  42100.0,
  0.0,
  44100.0,
  149500.0,
  426800.0],
 'highest_monthly_earnings': [9000000.0,
  0.05,
  5400000.0,
  7900000.0,
  7300000.0,
  2900000.0,
  156700.0,
  195800.0,
  2300000.0,
  3200000.0,
  2900000.0,
  2000000.0,
  0.07,
  6600000.0,
  0.0,
  673200.0,
  0.06,
  705300.0,
  2400000.0,
  6800000.0],
 'lowest_yearly_earnings': [6800000.0,
  0.04,
  4000000.0,
  5900000.0,
  5500000.0,
  2200000.0,
  117600.0,
  146800.0,
  1700000.0,
  2400000.0,
  2100000.0,
  1500000.0,
  0.05,
  5000000.0,
  0.0,
  504900.0,
  0.05,
  529000.0,
  1800000.0,
  5100000.0],
 'highest_yearly_earnings': [108400000.0,
  0.58,
  64700000.0,
  94800000.0,
  87500000.0,
  35100000.0,
  1900000.0,
  2300000.0,
  27900000.0,
  38600000.0,
  34300000.0,
  23900000.0,
  0.86,
  79600000.0,
  0.05,
  8100000.0,
  0.77,
  8500000.0,
  28700000.0,
  81900000.0],
 'created_year': ['2006.0',
  '2006.0',
  '2012.0',
  '2006.0',
  '2006.0',
  '2015.0',
  '2010.0',
  '2016.0',
  '2018.0',
  '2014.0',
  '2007.0',
  '2016.0',
  '2006.0',
  '2007.0',
  '2020.0',
  '2012.0',
  '2006.0',
  '2007.0',
  '2008.0',
  '2005.0'],
 'created_month': ['Mar',
  'Mar',
  'Feb',
  'Sep',
  'Sep',
  'May',
  'Apr',
  'Jan',
  'Apr',
  'Mar',
  'May',
  'Jun',
  'Aug',
  'Aug',
  'Jul',
  'Dec',
  'Jan',
  'Jan',
  'Jun',
  'Dec'],
 'created_date': ['13.0',
  '5.0',
  '20.0',
  '1.0',
  '20.0',
  '12.0',
  '29.0',
  '14.0',
  '23.0',
  '12.0',
  '11.0',
  '29.0',
  '15.0',
  '4.0',
  '27.0',
  '17.0',
  '30.0',
  '15.0',
  '4.0',
  '11.0']}

Solution

  • Assuming a DataFrame, you can use:

    out = (netf
        .query('country != "unspecified"')
        .groupby(['country','created_year'], sort=True)['subscribers'].sum()
        .reset_index('country').groupby('country')
        ['subscribers'].idxmax()
    )
    

    Or using a crosstab and idxmax:

    out = pd.crosstab(netf['country'], netf['created_year'],
                      netf['subscribers'], aggfunc='sum').idxmax(axis=1)
    

    Output:

    country
    Andorra       2006.0
    Argentina     2015.0
    Australia     2013.0
    Bangladesh    2021.0
    Barbados      2005.0
    Brazil        2006.0
    Name: subscribers, dtype: float64
    

    With the amount of subscribers:

    tmp = (netf
        .query('country != "unspecified"')
        .groupby(['country','created_year'], sort=True)['subscribers'].sum()
        .reset_index('country')
    )
    
    out = tmp.loc[tmp.groupby('country')['subscribers'].idxmax()]