azure-databricks

Azure Databricks: count rows in all tables - is there a better way?


I'm trying to find the best way to get row counts for all my Databricks tables. This is what I came up with:

# List of databases (SHOW DATABASES yields a databaseName column)
dvdbs = sqlContext.sql("show databases")

# Union the matching tables from every database into one DataFrame
for row in dvdbs.rdd.collect():
    tmp = "show tables from " + row['databaseName'] + " like 'xxx*'"
    if row['databaseName'] == 'default':
        dftbls = sqlContext.sql(tmp)
    else:
        dftbls = dftbls.union(sqlContext.sql(tmp))

# Count each table by selecting it in full and calling count()
tmplist = []
for row in dftbls.rdd.collect():
    tmp = 'select * from ' + row['database'] + '.' + row['tableName']
    tmpdf = sqlContext.sql(tmp)
    tmplist.append((row['database'], row['tableName'], tmpdf.count()))

columns = ['database', 'tableName', 'rowCount']
df = spark.createDataFrame(tmplist, columns)
display(df)

Solution

  • I found this to be significantly faster...

    # Collect every database, then every table in each database
    dfdbs = sqlContext.sql("show databases")
    for row in dfdbs.rdd.collect():
        tmp = "show tables from " + row['databaseName']
        if row['databaseName'] == 'default':
            dftbls = sqlContext.sql(tmp)
        else:
            dftbls = dftbls.union(sqlContext.sql(tmp))

    # Run a count(*) against each table; record -1 for tables that fail
    tmplist = []
    for row in dftbls.rdd.collect():
        try:
            tmp = 'select count(*) myrowcnt from ' + row['database'] + '.' + row['tableName']
            tmpdf = sqlContext.sql(tmp)
            myrowcnt = tmpdf.collect()[0]['myrowcnt']
            tmplist.append((row['database'], row['tableName'], myrowcnt))
        except Exception:
            tmplist.append((row['database'], row['tableName'], -1))

    columns = ['database', 'tableName', 'rowCount']
    df = spark.createDataFrame(tmplist, columns)
    display(df)
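
    On newer runtimes the same enumeration can be done without building SHOW TABLES strings by using the Catalog API (spark.catalog.listDatabases / spark.catalog.listTables). The sketch below is an illustrative alternative, assuming a standard spark session and reusing the same -1 convention for tables that fail to count:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Illustrative sketch: walk every database and (non-temporary) table via the Catalog API
    counts = []
    for db in spark.catalog.listDatabases():
        for tbl in spark.catalog.listTables(db.name):
            if tbl.isTemporary:
                continue  # temp views have no database-qualified name
            full_name = db.name + '.' + tbl.name
            try:
                counts.append((db.name, tbl.name, spark.table(full_name).count()))
            except Exception:
                counts.append((db.name, tbl.name, -1))  # mirror the -1 marker used above

    df = spark.createDataFrame(counts, ['database', 'tableName', 'rowCount'])
    display(df)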