Most influential companies on GitHub

TL;DR: Google is the most influential company on GitHub, but relative to its number of employees, Facebook fares better. As expected, JavaScript is the most used language.

To rank the most influential companies, we add up the stars received by all of each company's public GitHub repositories (in effect, the number of repositories weighted by the stars each one received).

Show the code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#get the data

import requests
import csv

# Rows collected by saveData(): [company, full_name, stars, watchers, language].
data = []
# Authorization header sent with every API request. 'code' is presumably a
# placeholder for a real GitHub token — replace it before running.
head = {'Authorization': 'token code'}

def saveData(body, company):
    """Append one row per repository in ``body["items"]`` to the global ``data``.

    Each row is ``[company, full_name, stargazers_count, watchers_count,
    language]``, taken from the repository entry as-is.
    """
    fields = ("full_name", "stargazers_count", "watchers_count", "language")
    for repo in body["items"]:
        row = [company]
        row.extend(repo[field] for field in fields)
        data.append(row)

def getNextUrl(headers):
    """Return the URL of the next result page, or ``None`` on the last page.

    ``headers`` is the mapping of response headers. Its ``link`` entry, when
    present, is a comma-separated list of ``<url>; rel="name"`` items.

    Every item is inspected rather than only the first one: the "next" link
    is not guaranteed to come first (for example when a rel="prev" link is
    listed before it), and looking at the first item only would end
    pagination early.
    """
    links = headers.get('link', None)
    if not links:
        return None
    for link in links.split(","):
        target, _, params = link.partition(";")
        if "next" in params:
            # Items after the first carry a leading space; strip it before
            # removing the surrounding angle brackets.
            return target.strip()[1:-1]
    return None

def getData(url):
    """GET ``url`` with the module-level ``head`` headers and return the response.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: if the server does not respond within
            30 seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    response = requests.get(url, headers=head, timeout=30)
    # Fail here with the HTTP status instead of later with a KeyError when
    # the error payload turns out to have no "items" key.
    response.raise_for_status()
    return response

def queryApi(url, company):
    """Fetch ``url`` and every following page, saving each page under ``company``.

    Pages are followed through the URL reported by ``getNextUrl`` until it
    reports none. Always returns ``None``; the rows are recorded by
    ``saveData``.
    """
    page_url = url
    while True:
        page = getData(page_url)
        saveData(page.json(), company)
        page_url = getNextUrl(page.headers)
        if not page_url:
            return None

# GitHub organisations to query, keyed by the company they are attributed to.
company_lists = {
    "google": ["google", "googlesamples"],
    "facebook": ["facebook"],
    "apache": ["apache"],
    "microsoft": ["microsoft"],
    "mozilla": ["mozilla"],
    "apple": ["apple"],
    "amazon": ["amzn", "amazonwebservices", "aws"],
}

# Repository search for a single organisation, 100 results per page.
search_url = "https://api.github.com/search/repositories?q=org:{}&type=Repositories&per_page=100"

# Every organisation's repositories are recorded under its parent company.
for key, value in company_lists.items():
    for company in value:
        queryApi(search_url.format(company), key)

# Dump the collected rows to CSV, header first.
# newline='' leaves line endings to the csv module; without it, platforms that
# translate '\n' (e.g. Windows) end up with a blank line after every row.
# NOTE(review): the analysis cell reads data/github-companies/companies.csv,
# not this path — confirm which file is the intended input.
with open('../data/github-companies/companies2.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(['Company', 'Repository', 'Stars', 'Watchers', 'Language'])
    wr.writerows(data)

Google is the most influential company on GitHub, followed by Facebook, Microsoft, Apache and Mozilla. Although they are not in the top five, we included Amazon and Apple to see how these two other big tech companies are doing:

Show the code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
        
#calculate most influential
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime

%matplotlib inline

# Load the scraped repositories and total the stars per company, largest first.
data = pd.read_csv("data/github-companies/companies.csv")
stars = data[["Company", "Stars"]].copy()
stars_count = stars.groupby(["Company"]).sum()
stars_count = stars_count.sort_values("Stars", ascending=False)

def make_autopct(values):
    """Build an ``autopct`` callback that labels a wedge with its share and value.

    Args:
        values: iterable of the numbers being plotted.

    Returns:
        A function taking a percentage ``pct`` and returning a label such as
        ``'25.00%  (1,000)'``, where the absolute value is ``pct`` percent of
        the sum of ``values``, rounded to an integer.
    """
    # Sum once, up front: the total is the same for every wedge, and a
    # one-shot iterable would be exhausted after the first label otherwise.
    total = sum(values)

    def my_autopct(pct):
        val = int(round(pct * total / 100.0))
        return '{p:.2f}%  ({v:,})'.format(p=pct, v=val)

    return my_autopct
    
# Pie chart of total stars per company. Only the first wedge is pulled out;
# stars_count is sorted descending, so that is the company with the most stars.
explode = (0.07, 0, 0, 0, 0, 0, 0)
stars_count.plot(
    kind="pie",
    y="Stars",
    autopct=make_autopct(stars_count["Stars"]),
    explode=explode,
    shadow=True,
    startangle=140,
    figsize=(12,12),
    title="Total number of stars per company with percentage",
)
# Drop the column name that pandas puts on the y axis.
plt.ylabel('')
plt.savefig('data/github-companies/stars.png')

Company	Stars
google	690623
facebook	552130
microsoft	296351
apache	199440
mozilla	122640
apple	72066
amazon	44504

But Google is much bigger than Facebook, so it is fairer to compare the companies relative to their number of employees. GitHub stars per employee are in Facebook's favour, suggesting that the company gives more to open source given its size:

Show the code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Head counts for the companies that are compared per employee.
employees = {
    "google": 72053,
    "facebook" : 20658,
    "microsoft" : 124000,
    "mozilla" : 1050
}
# Positional selection: rows 0-2 and row 4 of the star-sorted totals, which
# with the totals published in this article are google, facebook, microsoft
# and mozilla. np.r_ is taken from numpy directly because the pandas.np alias
# was deprecated in pandas 1.0 and later removed.
stars_with_employees = stars_count.iloc[np.r_[0:3, 4:5]].reset_index()
stars_with_employees["Employees"] = stars_with_employees["Company"].map(employees)
stars_with_employees["stars_per_employee"] = stars_with_employees.apply(lambda x: round(x.Stars / x.Employees,1), axis = 1)
# The sorted frame is not assigned, so the row (and wedge) order is unchanged.
stars_with_employees.sort_values("stars_per_employee", ascending=False)

# Pull out the second wedge (facebook with the selection above).
explode = (0, 0.1, 0, 0)
stars_with_employees[["stars_per_employee", "Company"]].set_index("Company").plot(kind="pie", y="stars_per_employee", autopct=make_autopct(stars_with_employees["stars_per_employee"]),
                 explode=explode, shadow=True, startangle=140, figsize=(12,12), title="Github stars per employee - percentage and total value")
plt.ylabel('')
plt.savefig('data/github-companies/stars_per_employee.png')

The majority of the work these companies do is in the JavaScript ecosystem. Java, C++ and Python are also receiving a lot of support. Facebook is investing a lot in OCaml, Apache in Scala and Microsoft in TypeScript.

Show the code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Independent copy of the three columns used for the per-company language charts.
language = data[["Company", "Language", "Stars"]].copy()

def filter_by_company(df, company):
    """Return a copy of the rows of ``df`` whose "Company" equals ``company``."""
    is_match = df["Company"] == company
    return df.loc[is_match].copy()

def group_by_language_stars(df):
    """Sum ``df`` per "Language" and order the result by "Stars", highest first."""
    per_language = df.groupby(["Language"]).sum()
    return per_language.sort_values("Stars", ascending=False)

def languages_pie(df, company):
    """Save a pie chart of the first five languages in ``df``, HTML and CSS excluded.

    Args:
        df: frame indexed by language with a "Stars" column, already in the
            order the wedges should appear.
        company: name used in the chart title and in the output file name.

    The figure is written to
    ``data/github-companies/languages_at_<company>.png``.
    """
    top = df[~df.index.isin(["HTML", "CSS"])].iloc[:5]
    # Pull out the first wedge only. Sized from the data so that a company
    # with fewer than five languages still plots.
    explode = tuple(0.1 if i == 0 else 0 for i in range(len(top)))
    # The label totals must come from the plotted rows: matplotlib reports
    # percentages of what is drawn, so totalling every language would inflate
    # the absolute values shown on the wedges.
    top.plot(kind="pie", y="Stars", autopct=make_autopct(top["Stars"]),
             explode=explode, shadow=True, startangle=140, figsize=(12,12), title="Top languages used on github at $\\bf{}$ (multiplied by number of stars)".format(company))
    plt.ylabel('')
    plt.savefig('data/github-companies/languages_at_{}.png'.format(company))

# Companies to chart, one languages pie each.
company_lists = ["google", "facebook", "apache", "microsoft", "mozilla", "apple", "amazon"]

for company in company_lists:
    # Filter to the company, total stars per language, then draw the chart.
    company_rows = filter_by_company(language, company=company)
    languages_pie(group_by_language_stars(company_rows), company)

Full code here.