What’s this for?
In a previous post I introduced some basic usages of git. I create git repos (short for repositories) for various purposes: each for a different research project, one for personal notes, one for management of my dotfiles (configuration files for Linux and various software), and some others for my side projects. It is not uncommon for one to maintain multiple local git repos on a single machine, and they may be located at different places in the file system. Over time, the number of local repos grows and it starts to become difficult to keep track of the status of each one of them. Therefore I created a Python script that "scans through" a collection of local repos and generates a report for me.
The Python script
Paste the code first:
'''Go through a list of local git repos and check their status wrt respective
remotes.
'''
from __future__ import print_function
import os
import re
import subprocess
# Local git repos to scan. A leading '~' stands for the HOME directory and
# is expanded with os.path.expanduser() before the path is used.
REPOS = ['~/.dotfiles', '~/Notebooks', '~/scripts/tools']

# Keywords looked for in the output of `git status`.
# A clean working tree means there is nothing left to commit.
RE_TREECLEAN_PATTERN = re.compile(r'working tree clean')
# Both spellings of "up to date" are matched.
RE_UP_TO_DATE_PATTERN = re.compile(r'up-to-date')
RE_UP_TO_DATE_PATTERN2 = re.compile(r'up to date')
# Position of the local branch relative to its tracked remote.
RE_BEHIND_PATTERN = re.compile(r'behind')
RE_AHEAD_PATTERN = re.compile(r'ahead')
RE_DIVERGED_PATTERN = re.compile(r'diverged')

# Keyword looked for in the stderr report of `git fetch`.
RE_FETCHED_PATTERN = re.compile(r'From github.com:')
def subRun(cmd, repo, fail_list):
    '''Run a git command in subprocess and record the repo if it fails.

    Success is judged by the exit status of the command, not by whether
    anything was written to stderr: git writes fetch reports and warnings
    to stderr even when the command succeeds, and the host name in a fetch
    report depends on the remote.

    Args:
        cmd (str): git command to run in shell.
        repo (str): abs path to the git repo. Only used for reporting and
            for <fail_list>; the command runs in the current working
            directory.
        fail_list (list): a list to store repos when the command
            failed to execute. Modified in place.
    Returns:
        ret (str): the stdout message from subprocess ('' if the command
            could not be run at all).
        err (str): '' if the command succeeded, otherwise a non-empty
            error description (the stderr message when there is one).
        fail_list (list): possibly modified input <fail_list>.
    '''
    try:
        proc=subprocess.Popen(cmd,shell=True,
                stdout=subprocess.PIPE,stderr=subprocess.PIPE)
        ret,err=proc.communicate()
    except Exception as e:
        # Best effort: one repo that cannot be checked must not abort the
        # whole scan, but it has to show up in the failed list.
        fail_list.append(repo)
        print('\n# <git_summary>: Failed to run %s on repo %s' %(cmd,repo))
        print('exception')
        print(e)
        return '','fail',fail_list

    # 'replace' keeps undecodable bytes from turning a finished run into
    # an exception.
    ret=ret.decode('utf-8','replace')
    err=err.decode('utf-8','replace')
    is_fetch='fetch' in cmd

    if proc.returncode!=0:
        fail_list.append(repo)
        print('\n# <git_summary>: Failed to run %s on repo %s' %(cmd,repo))
        if is_fetch:
            print('err of fetch:')
        else:
            print('err of Popen:')
        print(err)
        # Callers test len(err)>0 to detect failure, so never return ''
        # for a failed command even if it printed nothing to stderr.
        if len(err)==0:
            err='exit status %d' %proc.returncode
        return ret,err,fail_list

    if len(err)>0:
        # The command succeeded but wrote to stderr. For fetch, a line
        # starting with 'From ' is the report of newly fetched content;
        # anything else is shown as an informational message.
        fetched=is_fetch and any(line.startswith('From ')
                for line in err.splitlines())
        if fetched:
            print('\n# <git_summary>: New fetch in repo %s\n' %(repo))
        else:
            print('\n# <git_summary>: Message from %s on repo %s\n' %(cmd,repo))
        print(err)

    return ret,'',fail_list
if __name__=='__main__':
    # Entry point: fetch and inspect every repo in REPOS, sort each one
    # into a status bucket, then print a summary of all buckets.

    to_commit_list=[]
    ahead_list=[]
    behind_list=[]
    diverge_list=[]
    uptodate_list=[]
    tbd_list=[]  # failed repos

    # Checked in this order against the `git status` output; the first
    # pattern that matches decides the bucket.
    remote_checks=[
            (RE_UP_TO_DATE_PATTERN,uptodate_list),
            (RE_UP_TO_DATE_PATTERN2,uptodate_list),
            (RE_BEHIND_PATTERN,behind_list),
            (RE_AHEAD_PATTERN,ahead_list),
            (RE_DIVERGED_PATTERN,diverge_list),
            ]

    #----------------Loop through repos----------------
    for repo in REPOS:

        print('\n# <git_summary>: Checking repo: %s' %repo)
        repo=os.path.expanduser(repo)

        # A missing or unreadable directory should not abort the scan of
        # the remaining repos.
        try:
            os.chdir(repo)
        except OSError as e:
            print('\n# <git_summary>: Cannot enter repo %s' %repo)
            print(e)
            tbd_list.append(repo)
            continue

        # Fetch first so that `git status` compares against fresh remote
        # information. A non-empty err means the command failed.
        failed=False
        for cmd in ('git fetch','git status'):
            ret,err,tbd_list=subRun(cmd,repo,tbd_list)
            if len(err)>0:
                failed=True
                break
        if failed:
            # Make sure a failed repo is always reported, without listing
            # it twice if subRun already recorded it.
            if repo not in tbd_list:
                tbd_list.append(repo)
            continue

        #-------------------Check staging area-------------------
        if RE_TREECLEAN_PATTERN.search(ret) is None:
            to_commit_list.append(repo)
            continue

        #-----------Compare with tracked remote-----------
        for pattern,bucket in remote_checks:
            if pattern.search(ret) is not None:
                bucket.append(repo)
                break
        else:
            # Clean tree but no recognizable remote status.
            tbd_list.append(repo)

    print('''
#######################################################################
#                               Summary                               #
#######################################################################
    ''')

    sections=[
            ('\n# ------- Repos need commit -------',to_commit_list),
            ('\n# ------ Repos up to date with remote ------',uptodate_list),
            ('\n# ------ Repos ahead of remote ------',ahead_list),
            ('\n# ------ Repos behind remote ------',behind_list),
            ('\n# ------ Repos diverged from remote ------',diverge_list),
            ('\n# ------ Failed or undetermined repos ------',tbd_list),
            ]
    for title,repos in sections:
        print(title)
        for ii in repos:
            print('    * ',ii)

    if len(ahead_list)>0 or len(behind_list)>0 or len(diverge_list)>0:
        print('\n\n# To view diff between local and remote, run:')
        print('\n    git diff master origin/master')
Breakdown of the script
At the top of the script, I define a global parameter REPOS which
is a list of directory paths, each pointing to the location of a
repo I maintain locally. I’m showing only 3 for brevity; your list
may be longer.
Note that I’m using the tilde symbol ~ to refer to the
HOME directory. Mine is /home/guangzhi/, and yours will be
different. Therefore using ~ makes the script more portable so that
the same code works on my personal machine or the office one even if
their absolute HOME paths differ. Note
that to work with the ~ symbol as a shorthand for HOME, you will
need to "expand" it to get the actual path. This is done later using:
repo = os.path.expanduser(repo)
This will replace ~ with whatever path that points to your HOME
directory.
After that, I define a few regular expression (regex) patterns,
using Python’s built-in regex module re. These are used to
determine the status of a repo by matching some keywords from the
command line outputs from some git commands. For instance, when
running a git status command, the following message will be returned
if there is nothing changed in the repo and the staging area is
clean:


The keyword I’m trying to capture in this instance is the phrase
working tree clean. Its absence indicates the existence of
uncommitted changes in the repo, and some relevant information should
be printed in the final report to inform me about this.
Take the RE_UP_TO_DATE_PATTERN as another example. The full message
is Your branch is up to date with 'origin/master'., which can also be
seen in the screenshot above. This tells me that my local branch matches
the remote, otherwise it may be either behind or ahead
of the remote, which would be captured by RE_BEHIND_PATTERN or
RE_AHEAD_PATTERN. I noticed that, maybe due to version
changes, I got both up-to-date and up to date from git status,
therefore I added a second regex pattern to capture both.
Note that in order for such kind of local-remote comparisons to work,
one needs to first do a git fetch to download the information from
remote. Otherwise, how would git tell whether the local and remote
match each other? This is why for each repo to check, the first command
to run is:
ret, err, tbd_list = subRun('git fetch', repo, tbd_list)
This leads to the definition of the subRun() function. It is used to
execute a git command, such as git status, in the shell. To call
external programs from inside Python, one uses the subprocess
module, e.g.
proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
where cmd is the command to execute, either as a complete string
(such as 'git status' ) or a list of strings (such as ['git', 'status']). A subprocess.PIPE is assigned to stdout and stderr
to capture the standard out and standard error messages like this:
ret, err = proc.communicate()
Then based on these returned messages, a regex pattern is
applied on ret and/or err to determine the status of a repo.
For instance, if nothing gets fetched by the git fetch command,
both stdout and stderr will be empty strings. If something gets
fetched, stdout will be '', but stderr will contain some messages
like this:


The keyword I’m trying to capture is From github.com:, which is
matched by RE_FETCHED_PATTERN. In case of a match, it will inform
me about the new fetch and the repo name:
print('\n# <git_summary>: New fetch in repo %s\n' %(repo))
print(err)
Other than git fetch, non-empty stderr is treated as an indication
of a failed execution of the command, maybe due to some network issues.
In such cases, the repo name is appended to the fail_list list,
which will be printed out so that I can re-check them later:
print('\n# ------ Failed or undetermined repos ------')
for ii in tbd_list:
	print('    * ',ii)
In the __main__ section of the script, I create some empty lists to
store the results, before going into a loop through the repos defined in
REPOS. For each repo, execute the git fetch command first to
download updates, if any, from the remote. Then run the git status
command on the repo. The stdout (ret) and/or stderr (err) from
the command are matched against the regex patterns to determine the
status of the repo, for instance, whether it has uncommitted changes
or is lagging behind the remote, etc.
Then a summary is printed out showing all the repos that belong to the same category, e.g. all those that contain uncommitted changes, all those that are lagging behind or ahead of the remote, etc. Lastly, I print out a message to remind myself of how to make a diff between local and remote repos if my local is ahead of or behind the remote:
if len(ahead_list)>0 or len(behind_list)>0:
	print('\n\n# To view diff between local and remote, run:')
	print('\n    git diff master origin/master')
Sample output
To make things easier for me, I created an alias in my .bashrc
file:
alias gitck="python ~/.dotfiles/git_summary.py"
so that I can execute the Python script git_summary.py in the
command line by typing gitck and Enter. Below is the output from
my machine:
😷:tools$ gitck
# <git_summary>: Checking repo: ~/.dotfiles
# <git_summary>: Checking repo: ~/Notebooks
# <git_summary>: Checking repo: ~/scripts/tools
# <git_summary>: Checking repo: ~/scripts/project_03
# <git_summary>: Checking repo: ~/scripts/py_tctracker
# <git_summary>: Checking repo: ~/scripts/storm_tracker
# <git_summary>: Checking repo: ~/scripts/project_ar
# <git_summary>: Checking repo: ~/scripts/project_ar2
# <git_summary>: Checking repo: ~/scripts/fortran
# <git_summary>: Checking repo: ~/scripts/ml
#######################################################################
#                               Summary                               #
#######################################################################
    
# ------- Repos need commit -------
    *  /home/guangzhi/.dotfiles
    *  /home/guangzhi/Notebooks
    *  /home/guangzhi/scripts/project_ar
# ------ Repos up to date with remote ------
    *  /home/guangzhi/scripts/project_03
    *  /home/guangzhi/scripts/py_tctracker
    *  /home/guangzhi/scripts/storm_tracker
    *  /home/guangzhi/scripts/project_ar2
    *  /home/guangzhi/scripts/fortran
    *  /home/guangzhi/scripts/ml
# ------ Repos ahead of remote ------
# ------ Repos behind remote ------
    *  /home/guangzhi/scripts/tools
# ------ Failed or undetermined repos ------
# To view diff between local and remote, run:
    git diff master origin/master
		


