DBLP Parser

DBLP raw data is a ~500MB .xml file.
DBLP provides a simple parser, but it was not designed to fit our database structure.
To extract the necessary data we wrote our own parser, which fits our needs and produces output in the desired format.
The parser is written in C# and uses some of the latest .NET 3.5 features, such as LINQ to XML.
When processing is done, the data is inserted into a MySQL server.

Source Code

The main entry point of the program:

/// <summary>
/// Entry point of the DBLP parser. Streams the DBLP XML dump record by record,
/// assigns a numeric id to every author, collects (author, title) pairs,
/// records author home pages and counts how many records each pair of
/// authors shares. The collected data is then handed to the Build* writers.
/// </summary>
class Program
{
    // Location of the DBLP dump used when no fifth command-line argument is supplied.
    const string DefaultDblpXmlPath = @"C:\Users\Michal\Documents\Visual Studio 2008\Projects\DBLPParser\DBLPParser\dblp.xml";

    // Title value that marks a record as an author's home page entry.
    const string HomePageTitle = "Home Page";

    static string authorToBookFilePath;
    static string sharedBooksFilePath;
    static string authorToIdFilePath;
    static string authorToHomePageFilePath;

    /// <summary>
    /// Parses the DBLP dump and writes the output files.
    /// </summary>
    /// <param name="args">
    /// args[0] = shared-books file, args[1] = author-to-book file,
    /// args[2] = author-to-id file, args[3] = author-to-home-page file,
    /// args[4] (optional) = path of the DBLP XML dump.
    /// </param>
    static void Main(string[] args)
    {
        if (args == null || args.Length < 4)
        {
            Console.Error.WriteLine(
                "Expected arguments: <sharedBooksFile> <authorToBookFile> <authorToIdFile> <authorToHomePageFile> [dblpXmlFile]");
            Environment.ExitCode = 1;
            return;
        }

        sharedBooksFilePath = args[0];
        authorToBookFilePath = args[1];
        authorToIdFilePath = args[2];
        authorToHomePageFilePath = args[3];
        string dblpXmlPath = args.Length > 4 ? args[4] : DefaultDblpXmlPath;

        Dictionary<Partners, int> sharedRepository = new Dictionary<Partners, int>(1000000);
        Dictionary<Author, int> authorToId = new Dictionary<Author, int>();
        List<Dictionary<string, string>> authorToBook = new List<Dictionary<string, string>>();

        XmlReaderSettings settings = new XmlReaderSettings();
        settings.ProhibitDtd = false;
        settings.ValidationType = ValidationType.DTD;

        int globalAuthorsCounter = 0;

        // The reader holds the input file open, so make sure it is released
        // even when parsing fails half-way through.
        using (XmlReader reader = XmlReader.Create(dblpXmlPath, settings))
        {
            while (!reader.EOF)
            {
                reader.MoveToContent();

                // Records are the element children of the root (depth 1);
                // anything else is stepped over.
                if (reader.NodeType != XmlNodeType.Element || reader.Depth != 1)
                {
                    reader.Read();
                    continue;
                }

                // ReadFrom materialises just this record and leaves the reader
                // positioned after it, so the whole dump is never loaded at once.
                XElement record = (XElement)XNode.ReadFrom(reader);
                List<string> authorsInNode = record.Descendants("author").Select(e => e.Value).ToList();
                string title = record.Descendants("title").Select(e => e.Value).FirstOrDefault();

                // A record without a <title> yields null here; it is treated as
                // a regular publication that contributes no (author, title) pair.
                bool isHomePage = HomePageTitle.Equals(title);
                string homePageUrl = isHomePage
                    ? record.Descendants("url").Select(e => e.Value).FirstOrDefault()
                    : null;

                for (int i = 0; i < authorsInNode.Count; i++)
                {
                    Author firstAuthor = new Author(authorsInNode[i], null);

                    // Add id to the author
                    if (!authorToId.ContainsKey(firstAuthor))
                    {
                        authorToId.Add(firstAuthor, globalAuthorsCounter);
                        globalAuthorsCounter++;
                    }

                    if (isHomePage)
                    {
                        // Authors compare by name only, so the stored key can be
                        // swapped for one carrying the home page with hashed
                        // lookups instead of scanning every known author.
                        int id = authorToId[firstAuthor];
                        firstAuthor.HomePage = homePageUrl;
                        authorToId.Remove(firstAuthor);
                        authorToId.Add(firstAuthor, id);
                    }
                    else if (title != null)
                    {
                        // Regular publication: remember the (author, title) pair.
                        Dictionary<string, string> authorAndBook = new Dictionary<string, string>();
                        authorAndBook.Add(firstAuthor.Name, title);
                        authorToBook.Add(authorAndBook);
                    }

                    // Construct all author pairs of this record
                    for (int j = i + 1; j < authorsInNode.Count; j++)
                    {
                        string secondAuthorName = authorsInNode[j];
                        try
                        {
                            Partners partners = new Partners(firstAuthor.Name, secondAuthorName);

                            // TryGetValue leaves the counter at 0 for an unseen
                            // pair, so one indexer write covers both cases.
                            int sharedBooksCounter;
                            sharedRepository.TryGetValue(partners, out sharedBooksCounter);
                            sharedRepository[partners] = sharedBooksCounter + 1;
                        }
                        catch (ApplicationException)
                        {
                            // Partners rejects a pair made of the same name twice; skip it.
                            continue;
                        }
                    }
                }
            }
        }

        // Write the output to files
        BuildSharedBooksOutput(sharedRepository);
        BuildAuthorToId(authorToId);
        BuildAuthorsBooks(authorToBook);
    }
}

Author - Represents a single author entity

/// <summary>
/// A single author, identified by name, with an optional home page URL.
/// Equality and hashing are based on the name only (ordinal, case-sensitive);
/// the home page does not take part in either.
/// </summary>
class Author : IEquatable<Author>
{
    private string m_name;
    private string m_homePage;

    /// <summary>
    /// The author's name. The hash code is derived from it, so it should not
    /// be changed while the instance is used as a dictionary or set key.
    /// </summary>
    public string Name
    {
        get { return m_name; }
        set { m_name = value; }
    }

    /// <summary>The author's home page URL, or null when none is known.</summary>
    public string HomePage
    {
        get { return m_homePage; }
        set { m_homePage = value; }
    }

    /// <summary>Creates an author.</summary>
    /// <param name="name">The author's name.</param>
    /// <param name="homePage">The home page URL; may be null.</param>
    public Author(string name, string homePage)
    {
        m_name = name;
        m_homePage = homePage;
    }

    /// <summary>Two authors are equal when their names are identical.</summary>
    /// <param name="other">The author to compare with; may be null.</param>
    /// <returns>True when <paramref name="other"/> is non-null and has the same name.</returns>
    public bool Equals(Author other)
    {
        // Equals(object) forwards null for null or non-Author arguments,
        // so null must yield false rather than a NullReferenceException.
        if (ReferenceEquals(other, null))
        {
            return false;
        }

        // The static overload is ordinal and tolerates null names.
        return string.Equals(m_name, other.m_name);
    }

    /// <summary>Returns true when <paramref name="obj"/> is an Author with the same name.</summary>
    public override bool Equals(object obj)
    {
        return this.Equals(obj as Author);
    }

    /// <summary>Hash code derived from the name, consistent with Equals; 0 for a null name.</summary>
    public override int GetHashCode()
    {
        return m_name == null ? 0 : m_name.GetHashCode();
    }
}

Partners - Represents a unique pair of co-authors who have published at least one article together.

/// <summary>
/// An unordered pair of two distinct co-author names. Names are compared
/// case-insensitively using the current culture, and the order of the two
/// names is irrelevant: (A, B) equals (B, A).
/// </summary>
class Partners : IEquatable<Partners>
{
    private readonly string m_partner1;
    private readonly string m_partner2;

    /// <summary>The first name, as passed to the constructor.</summary>
    public string Partner1
    {
        get { return m_partner1; }
    }

    /// <summary>The second name, as passed to the constructor.</summary>
    public string Partner2
    {
        get { return m_partner2; }
    }

    /// <summary>Creates a pair of co-authors.</summary>
    /// <param name="partner1">Name of one co-author.</param>
    /// <param name="partner2">Name of the other co-author.</param>
    /// <exception cref="ArgumentNullException">Either name is null.</exception>
    /// <exception cref="ApplicationException">Both names denote the same author (ignoring case).</exception>
    public Partners(string partner1, string partner2)
    {
        if (partner1 == null)
        {
            throw new ArgumentNullException("partner1");
        }
        if (partner2 == null)
        {
            throw new ArgumentNullException("partner2");
        }
        if (SameName(partner1, partner2))
        {
            throw new ApplicationException("Two coworkers can not be identical.");
        }
        m_partner1 = partner1;
        m_partner2 = partner2;
    }

    /// <summary>Two pairs are equal when they hold the same two names, in either order.</summary>
    /// <param name="other">The pair to compare with; may be null.</param>
    public bool Equals(Partners other)
    {
        // Equals(object) forwards null for null or non-Partners arguments.
        if (ReferenceEquals(other, null))
        {
            return false;
        }

        bool sameOrder = SameName(m_partner1, other.m_partner1) && SameName(m_partner2, other.m_partner2);
        bool swapped = SameName(m_partner1, other.m_partner2) && SameName(m_partner2, other.m_partner1);
        return sameOrder || swapped;
    }

    /// <summary>Returns true when <paramref name="obj"/> is a Partners instance holding the same two names.</summary>
    public override bool Equals(object obj)
    {
        return this.Equals(obj as Partners);
    }

    /// <summary>
    /// Hash code consistent with Equals: computed with the same
    /// case-insensitive comparison, and combined with XOR so that the
    /// order of the two names does not matter.
    /// </summary>
    public override int GetHashCode()
    {
        // A case-sensitive hash here would put pairs that Equals considers
        // identical into different dictionary buckets.
        StringComparer comparer = StringComparer.CurrentCultureIgnoreCase;
        return comparer.GetHashCode(m_partner1) ^ comparer.GetHashCode(m_partner2);
    }

    // Single definition of "same author name" shared by the constructor and Equals.
    private static bool SameName(string left, string right)
    {
        return string.Equals(left, right, StringComparison.CurrentCultureIgnoreCase);
    }
}
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License