Exploring Campus Labs Engage's data with Python
17 Sep 2021
Code to determine how many unique values there were within the typeName
field of records fetched from the “events” API endpoint:
import pandas

inputfolder = 'C:\\example\\'

# Load the events export (one JSON object per line), keeping values as objects.
eventsdf = pandas.read_json(inputfolder + 'events.json', lines=True, dtype='object')

# Compute the distinct typeName values once, then report how many there are
# followed by the values themselves in sorted order.
type_names = eventsdf['typeName'].unique()
print(len(type_names))
print(sorted(type_names))
Code to determine how many unique categoryName values exist across the sublists of category records nested in the categories
field of records fetched from the “events” API endpoint:
import pandas
import json

inputfolder = 'C:\\example\\'

# Flatten every event's "categories" sublist into a single list of category
# records. The file holds one JSON object per line. It is opened explicitly as
# UTF-8 so the result does not depend on the platform's default encoding
# (pandas.read_json, used on the same file elsewhere, also defaults to UTF-8).
categories_list = []
with open(inputfolder+'events.json', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # skip blank lines rather than failing inside json.loads
        lineobj = json.loads(line)
        # A missing or null "categories" field is treated as an empty sublist.
        categories_list.extend(lineobj.get('categories') or [])

# One row per category record; report the number of distinct categoryName
# values, then the values themselves in sorted order.
categoriesdf = pandas.DataFrame(categories_list, dtype='object')
unique_category_names = categoriesdf['categoryName'].unique()
print(len(unique_category_names))
print(sorted(unique_category_names))
Code to figure out how the sublists of organization records nested in the hosts
field relate to each event's organizationId, for records fetched from the “events” API endpoint:
import pandas
import json

inputfolder = 'C:\\example\\'

# Read the events export (one JSON object per line). The file is opened
# explicitly as UTF-8 so the result does not depend on the platform's default
# encoding, and blank lines are dropped so they neither break json.loads nor
# inflate the event total printed below.
with open(inputfolder+'events.json', encoding='utf-8') as f:
    eventslines = [line for line in f if line.strip()]

# Tallies describing how each event's "hosts" sublist relates to its
# top-level "organizationId".
count_null_org_ids = 0
count_empty_host_lists = 0
count_lists_with_single_host_that_is_org_id = 0
count_lists_with_single_host_that_is_not_org_id = 0
count_lists_with_multiple_hosts = 0

for line in eventslines:
    lineobj = json.loads(line)

    org_id = lineobj.get('organizationId')  # Looks like no org_id values are None.
    if org_id is None:
        count_null_org_ids += 1

    # A missing or null "hosts" field is counted as an empty host list.
    hosts_array = lineobj.get('hosts') or []  # Looks like no hosts_array lengths are 0.
    if not hosts_array:
        count_empty_host_lists += 1
    elif len(hosts_array) > 1:
        count_lists_with_multiple_hosts += 1
    elif org_id == hosts_array[0].get('organizationId'):
        # Exactly one host, and it is the event's own organization.
        count_lists_with_single_host_that_is_org_id += 1
    else:
        # Exactly one host, but it is a different organization.
        count_lists_with_single_host_that_is_not_org_id += 1

print(f'''There are {count_null_org_ids} null org IDs.
Among {len(eventslines)} events, there are
{count_empty_host_lists} empty host lists,
{count_lists_with_single_host_that_is_not_org_id} abnormal single host lists,
{count_lists_with_single_host_that_is_org_id} events with a single host matching the org ID, and
{count_lists_with_multiple_hosts} events with multiple hosts.''')
Code to figure out how many characters are in the longest event name:
import pandas

inputfolder = 'C:\\example\\'

# Load the events export (one JSON object per line), keeping values as objects.
eventsdf = pandas.read_json(inputfolder + 'events.json', lines=True, dtype='object')

# Character count of every event name; the largest count is the answer.
name_lengths = eventsdf['eventName'].map(len)
print(name_lengths.max())