import csv
import itertools
import os
import sys

import pandas as pd
# Directory scanned for per-project classification exports.
DATA_DIR = '/Users/adam/downloads/'
# Only files whose names end with this suffix are loaded; the part of the
# file name before the suffix is used as the project name.
CLASSIFICATIONS_SUFFIX = '-classifications.csv'

# Raise the csv module's field size limit to sys.maxsize.
csv.field_size_limit(sys.maxsize)

# Maps project name -> set of integer user ids found in that project's export.
projects = {}
# Sorted so the project order (and everything derived from it) is
# reproducible; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(DATA_DIR)):
    if not filename.endswith(CLASSIFICATIONS_SUFFIX):
        continue
    # Only the user_id column is used, so skip parsing every other column.
    classifications = pd.read_csv(
        os.path.join(DATA_DIR, filename),
        usecols=['user_id'],
    )
    # Rows without a user_id are dropped before collecting the distinct ids.
    user_ids = classifications.user_id.dropna().unique()
    users = projects.setdefault(
        # Strip only the trailing suffix; str.replace() would also remove
        # any earlier occurrence of it inside the file name.
        filename[:-len(CLASSIFICATIONS_SUFFIX)],
        set(map(int, user_ids)),
    )
    print('{}: {} users'.format(filename, len(users)))
# Build one row per unordered pair of projects: the two project names and
# the number of user ids present in both projects' user sets.
sources = []
targets = []
counts = []
# combinations() yields each pair exactly once, in dict iteration order, so
# no identity comparison or "already used as a source" bookkeeping is needed.
project_pairs = itertools.combinations(projects.items(), 2)
for (project, users), (other_project, other_users) in project_pairs:
    sources.append(project)
    targets.append(other_project)
    counts.append(len(users & other_users))
data = pd.DataFrame({'sources': sources, 'targets': targets, 'counts': counts})
# Plotting section: builds a HoloViews Chord element from `data`.
# NOTE(review): the `%opts` lines below are IPython magics, not Python syntax,
# so this part presumably only runs inside a Jupyter/IPython session —
# confirm before running the file as a plain script.
import holoviews as hv
hv.extension('bokeh')  # load the 'bokeh' extension for HoloViews
# Options for Chord elements: colour and label by 'index', colour edges by
# the 'sources' column, and set width/height to 800.
%opts Chord [color_index='index' label_index='index' edge_color_index='sources' width=800 height=800]
# Use the 'Category20' colormap for both cmap and edge_cmap.
%opts Chord (cmap='Category20' edge_cmap='Category20')
# `data` is the DataFrame with 'sources', 'targets' and 'counts' columns.
graph = hv.Chord(data)
# Bare expression; presumably displays the diagram as the notebook cell output.
graph