A Python script to report status of local git repositories

A Python script that "scans through" a collection of local git repositories and generates a report for me.

What’s this for?

In a previous post I introduced some basic usages of git. I create git repos (short for repositories) for various purposes: each for a different research project, one for personal notes, one for management of my dotfiles (configuration files for Linux and different software), and some others for my side projects. It is not uncommon for one to maintain multiple local git repos on a single machine, and they may be located at different places in the file system. Over time, the number of local repos grows and it starts to become difficult to keep track of the status of each one of them. Therefore I created a Python script that "scans through" a collection of local repos and generates a report for me.

The Python script

Paste the code first:

'''Go through a list of local git repos and check their status wrt respective
remotes.
'''
from __future__ import print_function
import os
import re
import subprocess

# Local git repositories to scan. A leading '~' stands for the HOME
# directory and is expanded before the repo is visited.
REPOS = [
    '~/.dotfiles',
    '~/Notebooks',
    '~/scripts/tools',
]

# Keywords searched for in the output of `git status`.
# Absence of this phrase means there are uncommitted changes.
RE_TREECLEAN_PATTERN = re.compile(r'working tree clean')
# Both spellings have been observed in git's output, so keep one pattern
# for each.
RE_UP_TO_DATE_PATTERN = re.compile(r'up-to-date')
RE_UP_TO_DATE_PATTERN2 = re.compile(r'up to date')
RE_BEHIND_PATTERN = re.compile(r'behind')
RE_AHEAD_PATTERN = re.compile(r'ahead')
RE_DIVERGED_PATTERN = re.compile(r'diverged')
# The report of a successful `git fetch` starts with a "From <remote>" line.
# Match any remote (SSH or HTTPS, GitHub or another host) instead of only
# "From github.com:", and anchor at the start of a line so that the word
# "From" inside an error message does not count as a fetch report.
RE_FETCHED_PATTERN = re.compile(r'^From \S', re.MULTILINE)


def subRun(cmd, repo, fail_list):
    '''Run a git command in subprocess and report whether it succeeded.

    Success is decided by the exit status of the command, not by whether
    something was written to stderr: git writes informational messages
    (e.g. the report of a fetch) to stderr even when the command succeeds,
    and a command may fail without writing anything to stderr.

    Args:
        cmd (str): git command to run in shell.
        repo (str): abs path to the git repo. It is only used in the printed
            messages and to record failures; the command itself is run in
            the current working directory.
        fail_list (list): a list to store repos when the command
            failed to execute. Modified in place.
    Returns:
        ret (str): the stdout message from subprocess.
        err (str): '' if the command succeeded, otherwise a non-empty
            failure message (the stderr of the command when there is one).
        fail_list (list): possibly modified input <fail_list>.
    '''

    # Only launching/communicating can raise here, keep the try body small.
    try:
        proc = subprocess.Popen(cmd, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        ret, err = proc.communicate()
    except Exception as e:
        # Record the repo so that it still shows up in the final report.
        fail_list.append(repo)
        print('\n# <git_summary>: Failed to run %s on repo %s' % (cmd, repo))
        print('exception')
        print(e)
        return '', 'fail', fail_list

    # 'replace' keeps odd bytes in the output from raising a decode error.
    ret = ret.decode('utf-8', 'replace')
    err = err.decode('utf-8', 'replace')
    is_fetch = 'fetch' in cmd

    if proc.returncode != 0:
        fail_list.append(repo)
        print('\n# <git_summary>: Failed to run %s on repo %s' % (cmd, repo))
        print('err of fetch:' if is_fetch else 'err of Popen:')
        print(err)
        if len(err.strip()) == 0:
            # Callers test len(err)>0 to detect a failure, so never hand
            # back an empty error message for a failed command.
            err = 'exit status %d' % proc.returncode
        return ret, err, fail_list

    # Successful run: anything on stderr is informational only. A fetch
    # prints its report there when something new was downloaded.
    if len(err.strip()) > 0:
        if is_fetch:
            print('\n# <git_summary>: New fetch in repo %s\n' % (repo))
        else:
            print('\n# <git_summary>: Message from %s on repo %s\n'
                  % (cmd, repo))
        print(err)

    return ret, '', fail_list

if __name__=='__main__':

    # One list per possible outcome of the scan.
    to_commit_list=[]
    ahead_list=[]
    behind_list=[]
    diverge_list=[]
    uptodate_list=[]
    tbd_list=[]  # failed repos

    #----------------Loop through repos----------------
    for repo in REPOS:

        print('\n# <git_summary>: Checking repo: %s' %repo)

        repo=os.path.expanduser(repo)
        try:
            os.chdir(repo)
        except OSError as e:
            # A missing/unreadable directory must not abort the whole scan.
            print('\n# <git_summary>: Failed to enter repo %s' %repo)
            print(e)
            tbd_list.append(repo)
            continue

        # fetch first so that git status can compare with the remote
        failed=False
        for cmd in ('git fetch','git status'):
            ret,err,tbd_list=subRun(cmd,repo,tbd_list)
            if len(err)>0:
                failed=True
                break

        if failed:
            # make sure a failed repo always ends up in the report, even if
            # subRun() did not record it
            if repo not in tbd_list:
                tbd_list.append(repo)
            continue

        # <ret> now holds the output of git status
        #-------------------Check staging area-------------------
        if not RE_TREECLEAN_PATTERN.search(ret):
            to_commit_list.append(repo)
        #-----------Compare with tracked remote-----------
        elif RE_UP_TO_DATE_PATTERN.search(ret) or \
                RE_UP_TO_DATE_PATTERN2.search(ret):
            uptodate_list.append(repo)
        elif RE_BEHIND_PATTERN.search(ret):
            behind_list.append(repo)
        elif RE_AHEAD_PATTERN.search(ret):
            ahead_list.append(repo)
        elif RE_DIVERGED_PATTERN.search(ret):
            diverge_list.append(repo)
        else:
            # none of the keywords found, status undetermined
            tbd_list.append(repo)

    print('''
#######################################################################
#                               Summary                               #
#######################################################################
    ''')

    # (section title, repos in that section), in the order they are printed
    sections=[
            ('\n# ------- Repos need commit -------', to_commit_list),
            ('\n# ------ Repos up to date with remote ------', uptodate_list),
            ('\n# ------ Repos ahead of remote ------', ahead_list),
            ('\n# ------ Repos behind remote ------', behind_list),
            ('\n# ------ Repos diverged from remote ------', diverge_list),
            ('\n# ------ Failed or undetermined repos ------', tbd_list),
            ]

    for title,repo_list in sections:
        print(title)
        for ii in repo_list:
            print('    * ',ii)

    if len(ahead_list)>0 or len(behind_list)>0 or len(diverge_list)>0:
        print('\n\n# To view diff between local and remote, run:')
        print('\n    git diff master origin/master')

Break down of the script

At the top of the script, I define a global parameter REPOS which is a list of directory paths, each pointing to the location of a repo I maintain locally. I’m showing only 3 for brevity; your list may be longer.

Note that I’m using the tilde symbol ~ to refer to the HOME directory. Mine is /home/guangzhi/, and yours will be different. Therefore using ~ makes the script more portable so that the same code works on my personal machine or the office one even if their absolute HOME paths differ. Note that to work with the ~ symbol as a shorthand for HOME, you will need to "expand" it to get the actual path. This is done later using:

repo = os.path.expanduser(repo)

This will replace ~ with whatever path that points to your HOME directory.

After that, I define a few regular expression (regex) patterns, using Python’s built-in regex module re. These are used to determine the status of a repo by matching some keywords from the command line outputs from some git commands. For instance, when running a git status command, the following message will be returned if there is nothing changed in the repo and the staging area is clean:

The keyword I’m trying to capture in this instance is the phrase working tree clean. Its absence indicates the existence of uncommitted changes in the repo, in which case some relevant information should be printed in the final report to inform me about this.

Take the RE_UP_TO_DATE_PATTERN as another example. The full message is Your branch is up to date with 'origin/master'., which can also be seen in the screenshot above. This tells me that my local branch matches the remote, otherwise it may be either behind or ahead of the remote, which would be captured by RE_BEHIND_PATTERN or RE_AHEAD_PATTERN. I noticed that, maybe due to version changes, I got both up-to-date and up to date from git status, therefore I added a second regex pattern to capture both.

Note that in order for such kind of local-remote comparisons to work, one needs to first do a git fetch to download the information from remote. Otherwise, how would git tell whether the local and remote match each other? This is why for each repo to check, the first command to run is:

ret, err, tbd_list = subRun('git fetch', repo, tbd_list)

This leads to the definition of the subRun() function. It is used to execute a git command, such as git status, in the shell. To call external programs from inside Python, one uses the subprocess module, e.g.

proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

where cmd is the command to execute, either as a complete string (such as 'git status' ) or a list of strings (such as ['git', 'status']). A subprocess.PIPE is assigned to stdout and stderr to capture the standard out and standard error messages like this:

ret, err = proc.communicate()

Then based on these returned messages, a regex pattern is applied on ret and/or err to determine the status of a repo. For instance, if nothing gets fetched from the git fetch command, both stdout and stderr will be empty strings. If something gets fetched, stdout will be '', but stderr will contain some messages like this:

The keyword I’m trying to capture is From github.com:, which is matched by RE_FETCHED_PATTERN. In case of a match, it will inform me about the new fetch and the repo name:

print('\n# <git_summary>: New fetch in repo %s\n' %(repo))
print(err)

Other than git fetch, non-empty stderr is treated as an indication of a failed execution of the command, maybe due to some network issues. In such cases, the repo name is appended to the fail_list list, which will be printed out so that I can re-check them later:

print('\n# ------ Failed or undetermined repos ------')
for ii in tbd_list:
	print('    * ',ii)

In the __main__ section of the script, I create some empty lists to store the results, before going into a loop through the repos defined in REPOS. For each repo, execute the git fetch command first to download updates, if any, from the remote. Then run the git status command on the repo. The stdout (ret) and/or stderr (err) from the command are matched against the regex patterns to determine the status of the repo, for instance, whether it has uncommitted changes, is lagging behind the remote, etc.

Then a summary is printed out showing all the repos that belong to the same category, e.g. all those that contain uncommitted changes, all those that are lagging behind or ahead of the remote, etc. Lastly, I print out a message to remind myself of how to make a diff between local and remote repos if my local is ahead of or behind the remote:

if len(ahead_list)>0 or len(behind_list)>0:
	print('\n\n# To view diff between local and remote, run:')
	print('\n    git diff master origin/master')

Sample output

To make things easier for me, I created an alias in my .bashrc file:

alias gitck="python ~/.dotfiles/git_summary.py"

so that I can execute the Python script git_summary.py in the command line by typing gitck and Enter. Below is the output from my machine:

😷:tools$ gitck

# <git_summary>: Checking repo: ~/.dotfiles

# <git_summary>: Checking repo: ~/Notebooks

# <git_summary>: Checking repo: ~/scripts/tools

# <git_summary>: Checking repo: ~/scripts/project_03

# <git_summary>: Checking repo: ~/scripts/py_tctracker

# <git_summary>: Checking repo: ~/scripts/storm_tracker

# <git_summary>: Checking repo: ~/scripts/project_ar

# <git_summary>: Checking repo: ~/scripts/project_ar2

# <git_summary>: Checking repo: ~/scripts/fortran

# <git_summary>: Checking repo: ~/scripts/ml

#######################################################################
#                               Summary                               #
#######################################################################
    

# ------- Repos need commit -------
    *  /home/guangzhi/.dotfiles
    *  /home/guangzhi/Notebooks
    *  /home/guangzhi/scripts/project_ar

# ------ Repos up to date with remote ------
    *  /home/guangzhi/scripts/project_03
    *  /home/guangzhi/scripts/py_tctracker
    *  /home/guangzhi/scripts/storm_tracker
    *  /home/guangzhi/scripts/project_ar2
    *  /home/guangzhi/scripts/fortran
    *  /home/guangzhi/scripts/ml

# ------ Repos ahead of remote ------

# ------ Repos behind remote ------
    *  /home/guangzhi/scripts/tools

# ------ Failed or undetermined repos ------


# To view diff between local and remote, run:

    git diff master origin/master

Leave a Reply