<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">608043</article-id>
<article-id pub-id-type="doi">10.3389/fdata.2020.608043</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Proximity-Based Compression for Network Embedding</article-title>
<alt-title alt-title-type="left-running-head">Islam et al.</alt-title>
<alt-title alt-title-type="right-running-head">Proximity-Based Compression for Network Embedding</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Islam</surname>
<given-names>Muhammad Ifte</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1091844/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Tanvir</surname>
<given-names>Farhan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1187234/bio"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Johnson</surname>
<given-names>Ginger</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1100044/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Akbas</surname>
<given-names>Esra</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1033214/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Aktas</surname>
<given-names>Mehmet Emin</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1187233/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Department of Computer Science, Oklahoma State University, <addr-line>Stillwater</addr-line>, <addr-line>OK</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Department of Computer Science, University of Tulsa, <addr-line>Tulsa</addr-line>, <addr-line>OK</addr-line>, <country>United States</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Department of Mathematics and Statistics, University of Central Oklahoma, <addr-line>Edmond</addr-line>, <addr-line>OK</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/938992/overview">B. Aditya Prakash</ext-link>, Georgia Institute of Technology, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/751076/overview">Remy Cazabet</ext-link>, Universit&#xe9; de Lyon, France</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/961930/overview">Pei Yang</ext-link>, South China University of Technology, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Muhammad Ifte Islam, <email>ifte.islam@okstate.edu</email>; Esra Akbas, <email>eakbas@okstate.edu</email>
</corresp>
<fn>
<p>This article was submitted to Data Mining and Management, a section of the journal Frontiers in Big Data</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>26</day>
<month>01</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2020</year>
</pub-date>
<volume>3</volume>
<elocation-id>608043</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>09</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>07</day>
<month>12</month>
<year>2020</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Islam, Tanvir, Johnson, Akbas and Aktas.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Islam, Tanvir, Johnson, Akbas and Aktas</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="608043.pdf"/>
<abstract>
<p>Network embedding that encodes structural information of graphs into a low-dimensional vector space has been proven to be essential for network analysis applications, including node classification and community detection. Although recent methods show promising performance for various applications, graph embedding still has some challenges; either the huge size of graphs may hinder a direct application of the existing network embedding method to them, or they suffer compromises in accuracy from locality and noise. In this paper, we propose a novel <bold>N</bold>etwork <bold>E</bold>mbedding method, NECL, to generate embedding more efficiently or effectively. Our goal is to answer the following two questions: 1) Does the network <bold>C</bold>ompression significantly boost <bold>L</bold>earning? 2) Does network compression improve the quality of the representation? For these goals, first, we propose a novel graph compression method based on the neighborhood similarity that compresses the input graph to a smaller graph with incorporating local proximity of its vertices into super-nodes; second, we employ the compressed graph for network embedding instead of the original large graph to bring down the embedding cost and also to capture the global structure of the original graph; third, we refine the embeddings from the compressed graph to the original graph. NECL is a general meta-strategy that improves the efficiency and effectiveness of many state-of-the-art graph embedding algorithms based on node proximity, including DeepWalk, Node2vec, and LINE. Extensive experiments validate the efficiency and effectiveness of our method, which decreases embedding time and improves classification accuracy as evaluated on single and multi-label classification tasks with large real-world graphs.</p>
</abstract>
<kwd-group>
<kwd>network embedding</kwd>
<kwd>graph representation learning</kwd>
<kwd>graph compression</kwd>
<kwd>graph classification</kwd>
<kwd>node similarity</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title> Introduction</title>
<p>Networks are effectively used to represent relationships and dependence among data. Node classification, community detection, and link prediction are some of the applications of network analysis in many different areas such as social networks and biological networks. On the other hand, there are some challenges in network analysis, such as high computational complexity, low parallelizability, and inapplicability of machine learning methods (<xref ref-type="bibr" rid="B14">Cui et al., 2018</xref>). Recently, network embedding as representation learning from graph has become popular for many problems in network analysis (<xref ref-type="bibr" rid="B19">Hamilton et al., 2017</xref>; <xref ref-type="bibr" rid="B35">Zhang et al., 2017</xref>; <xref ref-type="bibr" rid="B8">Cai et al., 2018</xref>; <xref ref-type="bibr" rid="B14">Cui et al., 2018</xref>; <xref ref-type="bibr" rid="B17">Goyal and Ferrara, 2018</xref>). Network embedding is defined as encoding structural information of graphs into a low-dimensional vector space on their connections (<xref ref-type="bibr" rid="B25">Perozzi et al., 2014</xref>). By preserving structure information of the network, nodes with links will be close to each other in vector space. While desirable network embedding methods for real-world networks should preserve the local proximity between vertices and the global structure of the graph, it should also be scalable for large networks (<xref ref-type="bibr" rid="B30">Tang et al., 2015</xref>).</p>
<p>While early methods, which consider the network embedding as a dimensionality reduction (<xref ref-type="bibr" rid="B7">Belkin and Niyogi, 2001</xref>), are effective on small graphs, the major concern of them is that time complexity is at least quadratic in the number of graph vertices. Therefore, it is not possible to apply them on large-scale networks with billions of vertices (<xref ref-type="bibr" rid="B35">Zhang et al., 2017</xref>; <xref ref-type="bibr" rid="B8">Cai et al., 2018</xref>; <xref ref-type="bibr" rid="B14">Cui et al., 2018</xref>). In recent years, more scalable methods that use matrix factorization or neural networks have been proposed by transforming the network embedding problem into an optimization problem (<xref ref-type="bibr" rid="B30">Tang et al., 2015</xref>). DeepWalk (<xref ref-type="bibr" rid="B25">Perozzi et al., 2014</xref>) is the pioneering work that uses the idea of word representation learning (<xref ref-type="bibr" rid="B21">Mikolov et al., 2013a</xref>; <xref ref-type="bibr" rid="B22">Mikolov et al., 2013b</xref>) for network embedding. They preserve network structures or local neighborhood proximity with path sampling using short random walks (<xref ref-type="bibr" rid="B25">Perozzi et al., 2014</xref>; <xref ref-type="bibr" rid="B18">Grover and Leskovec, 2016</xref>). With path sampling, network embedding is converted to word embedding, which considers a random walk as a sequence of words. Therefore, it is expected that vertices in a similar neighborhood get similar paths and hence similar representations.</p>
<p>Although recent methods show promising performance for various applications, graph embedding still has some challenges. First of all, many of these methods are still computationally expensive and need a large amount of memory, so they are not scalable to large graphs (scalability problem). Secondly, these approaches attempt to address the non-convex optimization goal using stochastic gradient descent, hence optimization on the co-occurrence probability of the vertices can easily get stuck at a bad local minima as the result of poor initialization (initialization problem). This may cause generating dissimilar representations for vertices within the same or similar neighborhood set. Also, many of these methods use local information with short random walks during the embedding by ignoring the global structure in the graph.</p>
<p>These challenges have motivated researchers to use graph compression (summarization) algorithms that reduce the complexity and size of large graphs. The aim of graph compression is to create a smaller supergraph from a massive graph such that the crucial information of the original graph will be maintained in the supergraph. Vertices with similar characteristics are grouped and represented by super-nodes. Approximations with compressing are used to solve original problems more efficiently, such as all-pairs shortest paths, search engine storage, and retrieval (<xref ref-type="bibr" rid="B1">Adler and Mitzenmacher, 2001</xref>; <xref ref-type="bibr" rid="B28">Suel and Yuan, 2001</xref>). Using an approximation of the original graph not only makes a complex problem simpler but also makes a good initialization to solve the problem. It has been proved successful in various graph theory problems (<xref ref-type="bibr" rid="B16">Gilbert and Levchenko, 2004</xref>). For the scalability problem, embedding on the coarsest graph is more efficient and needs far less memory, which makes existing embedding methods applicable to large graphs. For the initialization problem, grouping vertices with similar characteristics in a compressed graph solves the problem of getting different representations for them.</p>
<p>HARP (<xref ref-type="bibr" rid="B12">Chen et al., 2018b</xref>) addresses the initialization problem by hierarchically compressing the graph by combining nodes into super-nodes randomly. Thus, it produces effective, low-level representation of nodes through multi-level learning. However, random edge compressing may put dissimilar nodes into the same super-node, making their representation similar. Also, multi-level compressing and learning result in significant compression and embedding cost, hence HARP fails to address the scalability problem.</p>
<p>In this paper, we use graph compression to address these two problems and also the limitations of HARP. More precisely, we study graph compression for the Network Embedding problem to answer these two questions:</p>
<p>Does the Network Compression Significantly Boost Learning?</p>
<p>Does the Network Compression Improve the Quality of the Representation?</p>
<p>Our main goal is to obtain more <italic>efficient</italic> and more <italic>effective</italic> network embedding models as answers to these questions. For this goal, we present an extension of our first method, NECL, that is a general meta-strategy for network embedding. We propose a proximity-based graph compression method that compresses the input graph to a smaller graph with incorporating the neighborhood similarity of its vertices into super-nodes. NECL compresses the graph by merging vertices with similar neighbors into super-nodes instead of random edge merging, as HARP does. NECL employs the embedding of the compressed graph to obtain the embedding of the original graph. This brings down the embedding cost and captures the global structure of the original graph without losing locality kept in the super-nodes. In addition to reducing the graph&#x2019;s size for embedding, we get less pairwise relationships from random walks on a smaller set of super-nodes, which generates less diverse training data for the embedding part. All these facts improve efficiency while maintaining <italic>similar</italic> or <italic>better effectiveness</italic> comparing to the baseline methods. We then project the embedding of super-nodes to the original nodes.</p>
<p>In NECL, we primarily focus on improving the efficiency of embedding methods, so we do not employ refinement. As a result, we may lose some local information of the nodes because of merging. To overcome this problem, in this paper, we go beyond our original NECL by introducing an embedding refinement method NECL-RF. Our second method, NECL-RF, uses the compressed graph&#x2019;s projected embedding to initialize the representation for the original graph embedding. Refining these initial representations aids in learning the original graph&#x2019;s embedding. This provides global information of the graph into learning and also solves the different initialization problem of similar vertices, hence increases the effectiveness. Since the compressed graph is quite small compared to the original graph, the learning time will not increase significantly. Hence <italic>similar efficiency</italic> is maintained compared to the baseline methods. Moreover, we provide a richer set of experiments to evaluate NECL and NECL-RF. While in the earlier version we only used DeepWalk and Node2vec as baseline methods for representation learning and combined them with NECL as a general meta-strategy, in this paper we add one more baseline method, LINE, and present the results of all with NECL and NECL-RF.</p>
<p>E<sc>xample</sc> <bold>1.</bold> In <xref ref-type="fig" rid="F1">Figure 1</xref>, we present the effectiveness of our compressing and embedding model, NECL, on the well-known Les Miserables network. This undirected network contains co-occurrences of characters in Victor Hugo&#x2019;s novel &#x2018;Les Miserables&#x2019;. A node represents a character and an edge between two nodes shows that these two characters appeared in the same chapter of the book. While the original network has 77 vertices and 254 edges, the compressed network has 33 vertices and 64 edges. As we see in the figure, the compressed network preserves the local structure of vertices in super-nodes without losing the global structure of the graphs. It is expected that nodes close in a graph should also be close in the embedding space. For example, in <xref ref-type="fig" rid="F1">Figure 1A</xref>, the neighborhood sets of the vertices <inline-formula id="inf1">
<mml:math id="mml-math1-fdata.2020.608043">
<mml:mrow>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mn>1,4,5,6,7,8,9</mml:mn>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are the same, including just node 0. Hence, random walks from these vertices have to pass through node 0 and get very similar walks and so very similar embeddings. Instead of walking separately from each of these vertices, we just need to walk for the super-node 7 in the compressed graph in <xref ref-type="fig" rid="F1">Figure 1B</xref> and learn one embedding. As presented in <xref ref-type="fig" rid="F1">Figures 1C,D</xref>, as the embedding of nodes with the original graph (C) and compressed graph (D), node proximity is preserved in the compressed graph. So, nodes close in original graph embedding are also close in compressed graph embedding.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Example of graph compressing on Les Miserables network (Original Network <bold>(A)</bold>, Compressed Network <bold>(B)</bold>, Embedding of Original Network <bold>(C)</bold> and Embedding of Compressed Network <bold>(D)</bold>).</p>
</caption>
<graphic xlink:href="fdata-03-608043-g001.tif"/>
</fig>
<p>We summarize our contributions as follows.<list list-type="bullet">
<list-item>
<p>&#x2022; New proximity-based graph compressing method: Based on the observation that vertices with similar neighborhood sets get similar random walks and eventually similar representation, we merge these vertices into super-nodes to get a smaller compressed graph that preserves the proximity of nodes in the original large graph.</p>
</list-item>
<list-item>
<p>&#x2022; Efficient embedding without losing effectiveness: We do random walks and embedding on the compressed graph, which is much smaller than the original graph, efficiently. This method has similar effectiveness with baseline methods by preserving the global and local structure of the graph in the compressed graph.</p>
</list-item>
<list-item>
<p>&#x2022; Effective embedding without decreasing efficiency: We use the embedding obtained from the compressed graph as initial vectors for the original graph embedding. This combines the global and local structure of the graph and improves the effectiveness. Embedding of a small compressed graph does not take much time with respect to original graph embedding, so it will not increase the embedding time significantly.</p>
</list-item>
<list-item>
<p>&#x2022; Generalizable: NECL is a general meta-strategy that can be used to improve the efficiency and effectiveness of many state-of-the-art graph embedding methods. We report the results for DeepWalk Node2vec and LINE.</p>
</list-item>
</list>
</p>
<p>The paper is structured as follows. In <xref ref-type="sec" rid="s2">Section 2</xref>, we give the necessary background for our method and also provide related work. In <xref ref-type="sec" rid="s3">Section 3</xref>, we introduce our neighborhood similarity-based graph compression model by explaining our similarity measure and two different embedding methods that use the compressed graph. In <xref ref-type="sec" rid="s4">Section 4</xref>, we present our experimental results and compare them with the baseline methods. Our final remarks are reported in <xref ref-type="sec" rid="s5">Section 5</xref>.</p>
</sec>
<sec id="s2">
<label>2</label>
<title> Background</title>
<p>In this section, we discuss related works in the area of network embedding. We give some details of pioneer works in network embedding focusing on DeepWalk. We also explain random walk based sampling methods and multi-level network embedding approaches here.</p>
<sec id="s2-1">
<label>2.1</label>
<title> Network Embedding</title>
<p>Network embedding plays a significant role in network data analysis, and it has received huge research attention in recent years. Previous researchers consider the graph embedding as a dimensionality reduction (<xref ref-type="bibr" rid="B11">Chen et al., 2018a</xref>), such as PCA (<xref ref-type="bibr" rid="B33">Wold et al., 1987</xref>) that captures linear structural information and LLE (locally linear embedding) (<xref ref-type="bibr" rid="B27">Roweis and Saul, 2000</xref>) that preserves the global structure of non-linear manifolds. While these methods are effective on small graphs, scalability is the major concern with them being applied to large-scale networks with billions of vertices, since the time complexity of these methods is at least quadratic in the number of graph vertices (<xref ref-type="bibr" rid="B35">Zhang et al., 2017</xref>; <xref ref-type="bibr" rid="B32">Wang et al., 2018</xref>). On the other hand, recent approaches in graph representation learning focus on the scalable methods that use matrix factorization (<xref ref-type="bibr" rid="B26">Qiu et al., 2018</xref>; <xref ref-type="bibr" rid="B29">Sun et al., 2019</xref>) or neural networks (<xref ref-type="bibr" rid="B30">Tang et al., 2015</xref>; <xref ref-type="bibr" rid="B10">Cao et al., 2016</xref>; <xref ref-type="bibr" rid="B31">Tsitsulin et al., 2018</xref>; <xref ref-type="bibr" rid="B34">Ying et al., 2018</xref>). Many of these aim to preserve the first and second-order proximity as a local neighborhood with path sampling using short random walks such as DeepWalk and Node2vec (<xref ref-type="bibr" rid="B19">Hamilton et al., 2017</xref>; <xref ref-type="bibr" rid="B8">Cai et al., 2018</xref>; <xref ref-type="bibr" rid="B14">Cui et al., 2018</xref>; <xref ref-type="bibr" rid="B17">Goyal and Ferrara, 2018</xref>). 
Some recent studies aim to preserve higher-order proximity (<xref ref-type="bibr" rid="B24">Ou et al., 2016</xref>; <xref ref-type="bibr" rid="B12">Chen et al., 2018b</xref>). In addition to these, some recent works integrate contents to learn better representations (<xref ref-type="bibr" rid="B5">Akbas and Zhao, 2019</xref>). While some studies use network embedding on node and graph classification (<xref ref-type="bibr" rid="B25">Perozzi et al., 2014</xref>; <xref ref-type="bibr" rid="B23">Niepert et al., 2016</xref>; <xref ref-type="bibr" rid="B12">Chen et al., 2018b</xref>), some others use it on graph clustering (<xref ref-type="bibr" rid="B9">Cao et al., 2015</xref>; <xref ref-type="bibr" rid="B5">Akbas and Zhao, 2019</xref>; <xref ref-type="bibr" rid="B2">Akbas and Zhao, 2017</xref>).</p>
<p>DeepWalk (<xref ref-type="bibr" rid="B25">Perozzi et al., 2014</xref>) is the pioneering work that uses the idea of word representation learning in (<xref ref-type="bibr" rid="B21">Mikolov et al., 2013a</xref>; <xref ref-type="bibr" rid="B22">Mikolov et al., 2013b</xref>) for network embedding. While vertices in a graph are considered as words, neighbors are considered as their context in natural language. A graph is represented as a set of random walk paths sampled from it. The learning process leverages the co-occurrence probability of the vertices that appear within a window in a sampled path. The Skip-gram model is trained on the random walks to learn the node representation (<xref ref-type="bibr" rid="B21">Mikolov et al., 2013a</xref>; <xref ref-type="bibr" rid="B22">Mikolov et al., 2013b</xref>). We give the formal definition of network embedding as follows.</p>
<p>
<sc>Definition</sc> 1 (Network embedding). <italic>Network embedding is a mapping</italic> <inline-formula id="inf2">
<mml:math id="mml-math2-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mi>d</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x226a;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <italic>which represents each vertex</italic> <inline-formula id="inf3">
<mml:math id="mml-math3-fdata.2020.608043">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> <italic>as a point in a low dimensional space</italic> <inline-formula id="inf4">
<mml:math id="mml-math4-fdata.2020.608043">
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mi>d</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>
<italic>.</italic>
</p>
<p>Here <italic>d</italic> is a parameter specifying the number of dimensions of our node representation. For every source node <inline-formula id="inf5">
<mml:math id="mml-math5-fdata.2020.608043">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, we define <inline-formula id="inf6">
<mml:math id="mml-math6-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2282;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> as a network neighborhood of node <italic>u</italic> generated through a neighborhood sampling strategy <italic>S</italic>. We seek to optimize the following objective function, which maximizes the log-probability of observing a network neighborhood <inline-formula id="inf7">
<mml:math id="mml-math7-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for a node <italic>u</italic> conditioned on its representation, given by &#x3d5;<disp-formula id="e1">
<mml:math>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
<mml:mi>f</mml:mi>
</mml:munder>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>There is an assumption that the conditional independence of vertices will ignore the vertex ordering in the neighborhood sampling to make the optimization problem tractable. Therefore, the likelihood is factorized by assuming that the likelihood of observing a neighborhood node is independent of observing any other neighborhood node given the representation of the source<disp-formula id="e2">
<mml:math>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo>&#x220f;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:munder>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The conditional likelihood of every source-neighborhood node pair is modeled as a softmax unit parametrized by a dot product of their features.<disp-formula id="e3">
<mml:math>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>exp</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mtext>exp</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>It is too expensive to compute the summation over all vertices for large networks, so we approximate it using negative sampling (<xref ref-type="bibr" rid="B22">Mikolov et al., 2013b</xref>). We optimize <xref ref-type="disp-formula" rid="e1">Equation 1</xref> using stochastic gradient ascent over the model parameters defining the embedding &#x3d5;.</p>
<sec id="s2-1-1">
<label>2.1.1</label>
<title> Random Walk Based Sampling</title>
<p>The neighborhoods <inline-formula id="inf8">
<mml:math id="mml-math8-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are not restricted to just immediate neighbors but can have vastly different structures depending on the sampling strategy <italic>S</italic>. There are many possible neighborhood sampling strategies for vertices as a form of local search. Different neighborhoods coming from different strategies result in different learned feature representations. For scalability of learning, random walk based methods are used to capture the structural relationships of vertices. They maximize the co-occurrence probability of subsequent vertices within a fixed-length window of random walks to preserve higher-order proximity between vertices. With random walks, networks are represented as a collection of vertex sequences. In this section, we take a deeper look at the network neighborhood sampling strategy based on random walks and the proximity captured by random walks.</p>
<p>The co-occurrence probability of node pairs depends on the transition probabilities of vertices. Considering a graph <italic>G</italic>, we define adjacency matrix <italic>A</italic> that is symmetric for undirected graphs. For an unweighted graph, we have <inline-formula id="inf9">
<mml:math id="mml-math9-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> if and only if there exists an edge from <inline-formula id="inf10">
<mml:math id="mml-math10-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf11">
<mml:math id="mml-math11-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf12">
<mml:math id="mml-math12-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> otherwise. For a graph with adjacency matrix <italic>A</italic>, we can define the diagonal matrix, known as degree matrix, as <inline-formula id="inf13">
<mml:math id="mml-math13-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mi>k</mml:mi>
</mml:munder>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> if <inline-formula id="inf14">
<mml:math id="mml-math14-fdata.2020.608043">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf15">
<mml:math id="mml-math15-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> otherwise. In a random walk, transition probability from one node to another depends on the degree of the vertices. The probability of leaving a node from one of its edges is split uniformly among the edges. We define this one step transition probability as <italic>T</italic>: <inline-formula id="inf16">
<mml:math id="mml-math16-fdata.2020.608043">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> where <inline-formula id="inf17">
<mml:math id="mml-math17-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the probability of a transition from vertex <inline-formula id="inf18">
<mml:math id="mml-math18-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to vertex <inline-formula id="inf19">
<mml:math id="mml-math19-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> within one step.</p>
</sec>
<sec id="s2-1-2">
<label>2.1.2</label>
<title> Multi-Level Network Embedding</title>
<p>Optimization of a non-convex function in these methods could easily get stuck at a bad local minimum as the result of poor initialization. Moreover, while preserving local proximities of vertices in a network, they may not preserve the global structure of the network. As a solution to these issues, a multi-level graph representation learning paradigm has been proposed (<xref ref-type="bibr" rid="B12">Chen et al., 2018b</xref>; <xref ref-type="bibr" rid="B6">Ayan Kumar Bhowmick and Meneni, 2020</xref>; <xref ref-type="bibr" rid="B20">Liang et al., 2018</xref>; <xref ref-type="bibr" rid="B13">Chenhui Deng and Zhao, 2020</xref>). HARP is proposed in (<xref ref-type="bibr" rid="B12">Chen et al., 2018b</xref>) as a graph preprocessing step to get better initialization vectors. In this approach, related vertices in the network are hierarchically combined into super-nodes at varying levels of coarseness. After learning the embedding of the coarsened network with a state-of-the-art graph embedding method, the learned embedding is used as an initial value for the next level. The initialization with the embedding of the coarsened network improves the performance of the state-of-the-art methods. One of the limitations of this method is that multi-level compressing and learning results in significant compression and embedding cost. Random edge compressing may put dissimilar nodes into the same super-node, which makes their representations similar.</p>
<p>As a more efficient solution, MILE (<xref ref-type="bibr" rid="B20">Liang et al., 2018</xref>) performs multi-level network embedding on large graphs using graph coarsening and refining techniques. It compresses the graph repeatedly based on Structural Equivalence Matching (SEM) and Normalized Heavy Edge Matching (NHEM). After learning the embedding of the compressed graph, they refine it efficiently through a novel graph convolution neural network to get the embedding of the original graph. This way, it receives embedding for large scale graphs in an efficient and effective way. More recently, GraphZoom (<xref ref-type="bibr" rid="B13">Chenhui Deng and Zhao, 2020</xref>) proposes a multi-level spectral approach to enhance both the quality and scalability. It performs graph fusion to generate a new graph that effectively encodes the topology of the original graph and the node attribute information. Then they apply spectral clustering methods to merge the nodes into super-nodes with the aim of retaining the first few eigenvectors of the graph Laplacian matrix. Finally, after getting the embedding of the compressed graph, they refine it by applying projection on it to get the original graph embedding. LouvainNE (<xref ref-type="bibr" rid="B6">Ayan Kumar Bhowmick and Meneni, 2020</xref>) applies the Louvain clustering algorithm recursively to partition the original graph into multiple subgraphs and construct a hierarchical partition of the graph, which is represented as a tree. Then they generate different meta-graphs from the tree and apply a baseline method, i.e., DeepWalk or Node2vec. After getting the embeddings from the different meta-graphs, they combine these embeddings to find the final embedding. They use a parameter to regulate the weights of the different embeddings for combining.</p>
<p>Our approach differs from these by applying similarity-based compressing to preserve the local information. Also, all of these approaches apply hierarchical compressing that may take more time, but we apply single-level compressing and use it to get the original graph embedding. NECL uses the graph coarsening to capture the local structure of the network in a non-hierarchical manner to improve the efficiency of the random walk based state-of-the-art methods.</p>
</sec>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<label>3</label>
<title> Methodology</title>
<p>While a desirable network embedding method for real-world networks should preserve the local proximity between vertices and the global structure of the graph, it should also be scalable for large networks. This section presents our novel network embedding models, NECL and NECL-RF, which satisfy these requirements. We extend the idea of the graph compressing layout to network representation learning methods. After giving some preliminary information, we explain our proximity-based compression method and how we combine compression with network embedding.</p>
<p>In this paper, we consider an undirected, connected, simple graph <inline-formula id="inf20">
<mml:math id="mml-math20-fdata.2020.608043">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> where <inline-formula id="inf21">
<mml:math id="mml-math21-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the set of vertices, and <inline-formula id="inf22">
<mml:math id="mml-math22-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
<mml:mo>&#x2286;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the set of edges. The set of neighbors for a given vertex <inline-formula id="inf23">
<mml:math id="mml-math23-fdata.2020.608043">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is denoted as <inline-formula id="inf24">
<mml:math id="mml-math24-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf25">
<mml:math id="mml-math25-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. We now define what a compressed graph is.</p>
<p>D<sc>efinition</sc> 2 (Compressed graph). <italic>A compressed graph of a given graph</italic> <inline-formula id="inf26">
<mml:math id="mml-math26-fdata.2020.608043">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> <italic>is represented as</italic> <inline-formula id="inf27">
<mml:math id="mml-math27-fdata.2020.608043">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>G</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> <italic>where</italic> <inline-formula id="inf28">
<mml:math id="mml-math28-fdata.2020.608043">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> <italic>is the graph summary with super-nodes</italic> <inline-formula id="inf29">
<mml:math id="mml-math29-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> <italic>and super-edges</italic> <inline-formula id="inf30">
<mml:math id="mml-math30-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> <italic>and M is a mapping from each node v in &#x0024;V_G&#x0024; to its super-node in</italic> <inline-formula id="inf31">
<mml:math id="mml-math31-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
<italic>. A super-edge</italic> <inline-formula id="inf32">
<mml:math id="mml-math32-fdata.2020.608043">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> <italic>in</italic> <inline-formula id="inf33">
<mml:math id="mml-math33-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> <italic>represents the set of all edges between vertices in the super-nodes</italic> <inline-formula id="inf34">
<mml:math id="mml-math34-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> <italic>and</italic> <inline-formula id="inf35">
<mml:math id="mml-math35-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
<italic>.</italic>
</p>
<sec id="s3-1">
<label>3.1</label>
<title> Neighborhood Similarity-Based Graph Compression</title>
<p>The critical problem for graph compression while preserving the local structure of the graph is to accurately identify vertices that have similar neighborhoods, so they are more likely to have similar representations. In this section, we discuss how to select vertices to merge into super-nodes.</p>
<sec id="s3-1-1">
<label>3.1.1</label>
<title> Motivation</title>
<p>The motivation of our method is that if two vertices have many common neighbors, many embedding algorithms that preserve local neighborhood information will give similar representations to them. This comes from our following observation that if two vertices, <inline-formula id="inf36">
<mml:math id="mml-math36-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf37">
<mml:math id="mml-math37-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, of a graph have many common neighbors, they also have similar transition probabilities to other vertices. This means that if <inline-formula id="inf38">
<mml:math id="mml-math38-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf39">
<mml:math id="mml-math39-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are similar, their transition probability vectors, <inline-formula id="inf40">
<mml:math id="mml-math40-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mtext>&#x2a;</mml:mtext>
<mml:msubsup>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf41">
<mml:math id="mml-math41-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mtext>&#x2a;</mml:mtext>
<mml:msubsup>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, will be similar as well. Hence they have similar neighborhoods and get similar neighborhood sets from random walks, and as a result, they get similar representations from the learning process.</p>
<p>For example, in the toy graph in <xref ref-type="fig" rid="F2">Figure 2</xref>, the neighbor sets of the nodes <italic>a</italic> and <italic>b</italic> are the same. Hence, their transition probabilities to the other neighbor vertices are also the same, i.e., <inline-formula id="inf42">
<mml:math id="mml-math42-fdata.2020.608043">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> for all <inline-formula id="inf43">
<mml:math id="mml-math43-fdata.2020.608043">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mn>1,2,3,4</mml:mn>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Starting on either <italic>a</italic> or <italic>b</italic> yields the same or very similar walks, so they have the same or similar representation. Therefore, instead of walking and learning representations for both <italic>a</italic> and <italic>b</italic>, it is enough to learn one for both of them. For this, we can merge this node pair <inline-formula id="inf44">
<mml:math id="mml-math44-fdata.2020.608043">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> into one super-node <inline-formula id="inf45">
<mml:math id="mml-math45-fdata.2020.608043">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Transition probabilities of this super-node to neighbors of <italic>a</italic> and <italic>b</italic> are still the same with <italic>a</italic> and <italic>b</italic>, i.e., <inline-formula id="inf46">
<mml:math id="mml-math46-fdata.2020.608043">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> for all <inline-formula id="inf47">
<mml:math id="mml-math47-fdata.2020.608043">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mn>1,2,3,4</mml:mn>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. When we obtain the representation of the super-node <inline-formula id="inf48">
<mml:math id="mml-math48-fdata.2020.608043">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, we can project it as the representation of each node in this pair. Merging these vertices keeps the preservation of the first and second-order proximity. Thus, this does not affect the results of walking and learning, but it increases efficiency.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Example of graph compressing. <bold>(a,b)</bold> are merged into super-node ab connected to the neighbors of both <bold>(a,b)</bold>.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g002.tif"/>
</fig>
<p>Furthermore, compressing may change the transition probability of neighbors of compressed vertices since the number of their neighbors decreases. As a result, the transition probability of each neighbor changes. For example, in the toy graph in <xref ref-type="fig" rid="F2">Figure 2A</xref>, while the transition probability from <inline-formula id="inf49">
<mml:math id="mml-math49-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to its neighbors is <inline-formula id="inf50">
<mml:math id="mml-math50-fdata.2020.608043">
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, after compressing, it becomes <inline-formula id="inf51">
<mml:math id="mml-math51-fdata.2020.608043">
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> since the number of neighbors decreases by one. In order to avoid this problem, we assign weights to edges of super-nodes based on the number of merged edges within the compression. For example, the super-edge between super-node <inline-formula id="inf52">
<mml:math id="mml-math52-fdata.2020.608043">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf53">
<mml:math id="mml-math53-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> includes two edges which are <inline-formula id="inf54">
<mml:math id="mml-math54-fdata.2020.608043">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf55">
<mml:math id="mml-math55-fdata.2020.608043">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Therefore, the weight of the super-edge (<inline-formula id="inf56">
<mml:math id="mml-math56-fdata.2020.608043">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) should be 2.</p>
<p>In a real-world graph, it is not expected to have too many vertices sharing exactly the same neighborhood. However, for many graph mining problems, such as node classification and graph clustering, if two vertices share many common neighbors, they are expected to be in the same class or cluster, although their neighbor sets are not completely the same. Hence, we expect to have similar feature vectors for the vertices in the same class/cluster after embedding. From these observations, we can also apply the same merge operation on these vertices. Following the same idea in the example above, if neighbors of two vertices are similar (but not exactly the same), we can merge them into a super-node and learn one representation for all. While we can project this super-node embedding to the original vertices and use the same representation for both, we can also update them in the refinement phase to embed their differences into their representations.</p>
</sec>
<sec id="s3-1-2">
<label>3.1.2</label>
<title> Proximity Based Graph Compressing</title>
<p>In this section, we define our graph compressing algorithm formally.</p>
<p>For a given graph <italic>G</italic>, if a set of vertices <inline-formula id="inf57">
<mml:math id="mml-math57-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in <inline-formula id="inf58">
<mml:math id="mml-math58-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> have similar neighbors, we merge these vertices into one super-node <inline-formula id="inf59">
<mml:math id="mml-math59-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mn>12...</mml:mn>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to get a smaller compressed graph <inline-formula id="inf60">
<mml:math id="mml-math60-fdata.2020.608043">
<mml:mrow>
<mml:msup>
<mml:mi>G</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. To decide which vertices to merge, we define the <italic>neighborhood similarity</italic> based on the transition probability. Before defining the neighborhood similarity, we first show that cosine similarity between transition probabilities of two vertices <italic>u</italic> and <italic>v</italic>, <inline-formula id="inf61">
<mml:math id="mml-math61-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf62">
<mml:math id="mml-math62-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, is determined by the number of their common neighbors.</p>
<p>T<sc>heorem</sc> 1. Let T be the 1-step transition probability matrix of vertices V in a graph G and let <inline-formula id="inf63">
<mml:math id="mml-math63-fdata.2020.608043">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. Let <inline-formula id="inf64">
<mml:math id="mml-math64-fdata.2020.608043">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf65">
<mml:math id="mml-math65-fdata.2020.608043">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> be the neighborhood sets of <italic>u</italic> and <italic>v</italic> and <inline-formula id="inf66">
<mml:math id="mml-math66-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf67">
<mml:math id="mml-math67-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> be the transition probability vectors from u and v to other vertices. Then the similarity between <inline-formula id="inf68">
<mml:math id="mml-math68-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf69">
<mml:math id="mml-math69-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is proportional to the number of common neighbors, <inline-formula id="inf70">
<mml:math id="mml-math70-fdata.2020.608043">
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>P<sc>roof</sc>. The cosine similarity between <inline-formula id="inf71">
<mml:math id="mml-math71-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf72">
<mml:math id="mml-math72-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is defined by<disp-formula id="e4">
<mml:math>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>By definition of <italic>T</italic>, we have <inline-formula id="inf73">
<mml:math id="mml-math73-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf74">
<mml:math id="mml-math74-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>. Furthermore, we have<disp-formula id="e5">
<mml:math>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
</disp-formula>and<disp-formula id="e6">
<mml:math>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
</mml:munder>
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>Hence, if we plug these into <xref ref-type="disp-formula" rid="e4">Equation 2</xref>, we get<disp-formula id="e7">
<mml:math>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
</disp-formula>Therefore,<disp-formula id="e8">
<mml:math>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x221d;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>This completes the proof.</p>
<p>From Theorem 1, we see that the similarity of transition probabilities from two vertices to other vertices depends on the similarity of their neighbors. Therefore, for the compression, we define the neighborhood similarity between two vertices as follows.</p>
<p>
<sc>Definition</sc> 3 (Neighborhood similarity) Given a graph G, the neighborhood similarity between two vertices <inline-formula id="inf75">
<mml:math id="mml-math75-fdata.2020.608043">
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is given by<disp-formula id="e9">
<mml:math>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>In order to normalize the effect of high-degree vertices, we divide the number of common neighbors by the degrees of the vertices. The neighborhood similarity is between 0 and 1, where it is 0 when two vertices have no common neighbor and 1 when both have the same neighbors. According to the neighborhood similarity, we merge vertices whose similarity value is higher than a given threshold value.</p>
<p>The neighborhood similarity-based graph compressing algorithm is given in <xref ref-type="fig" rid="F13">Algorithm 1</xref>. It is clear that the vertices with a nonzero neighborhood similarity are 2-step neighbors. Therefore, we do not need to compute the similarity between all pairs of vertices. Instead, we just need to compute the similarity between vertices and their neighbors&#x2019; neighbors. For each node <inline-formula id="inf77">
<mml:math id="mml-math77-fdata.2020.608043">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, we compute the similarity between <italic>v</italic> and each node <italic>k</italic> among its neighbors&#x2019; neighbors (line 3&#x2013;10). Then, we check the similarity value of all pairs (<italic>u</italic>, <italic>k</italic>) in the list and, if it is higher than the given threshold &#x3bb; (line 12), we merge <italic>u</italic> and <italic>k</italic> into a super-node <inline-formula id="inf78">
<mml:math id="mml-math78-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (line 13). Then we delete edges of <italic>u</italic> and <italic>k</italic> and add edges between neighbors of <italic>u</italic> and <italic>k</italic> and the new super-node <inline-formula id="inf79">
<mml:math id="mml-math79-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (line 17&#x2013;24). We assign weights to the edges of super-nodes based on the number of merged edges within the compression. Threshold &#x3bb; decides the trade-off between efficiency and effectiveness. If we use a larger value, it will merge fewer vertices. On the other hand, if we use a smaller value, we merge more vertices, and as a side effect, we may merge some dissimilar vertices as well, which may result in an increase in efficiency but cause a decrease in accuracy. Note that since we use the original neighborhood similarity, the order of merging does not affect the result, so we randomly select a node and check its neighbors for compression. Furthermore, one super-node may include more than two vertices of the original graph. For example, if the similarity between the vertices <italic>x</italic> and <italic>y</italic>, <inline-formula id="inf80">
<mml:math id="mml-math80-fdata.2020.608043">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and the vertices <italic>y</italic> and <italic>z</italic>, <inline-formula id="inf81">
<mml:math id="mml-math81-fdata.2020.608043">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, are both bigger than given threshold, we merge <italic>x</italic> and <italic>y</italic> in <inline-formula id="inf82">
<mml:math id="mml-math82-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and then we merge <inline-formula id="inf83">
<mml:math id="mml-math83-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <italic>z</italic> into <inline-formula id="inf84">
<mml:math id="mml-math84-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Therefore, during the merge operation, we check whether the node <italic>y</italic> is already merged with another node and, if so, we use the super-node that contains <italic>y</italic>.</p>
<fig id="F13" position="float">
<label>Algorithm 1</label>
<caption>
<p>Graph Compressing (<italic>G</italic>, &#x03BB;).</p>
</caption>
<graphic xlink:href="fdata-03-608043-g013.tif"/>
</fig>
</sec>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title> Network Embedding</title>
<p>Our NECL framework is adaptive with any embedding method which preserves the neighborhood proximity of nodes, i.e., DeepWalk, Node2vec, and LINE. We get the embedding for the original graph in two ways.</p>
<sec id="s3-2-1">
<label>3.2.1</label>
<title> Network Embedding on Compressed Graph</title>
<p>Our main goal in this section is to <italic>improve the efficiency</italic> of the embedding problem while maintaining <italic>similar effectiveness</italic> with the baseline methods. For this goal, instead of embedding the original graph, we embed the compressed graph and employ this embedding for the original graph embedding.</p>
<p>We first start compressing the graph for a given similarity threshold, as explained in the previous section. Then we learn the embedding of super-nodes in the compressed graph. Next, we assign the representation of each super-node in the compressed graph as the representation of the corresponding vertices in each super-node and obtain the embedding of the original graph. Since the size of the compressed graph is much smaller than the original graph, the embedding will be more efficient. The details of our algorithm for network embedding on a compressed graph is given in <xref ref-type="fig" rid="F14">Algorithm 2</xref>.</p>
<fig id="F14" position="float">
<label>Algorithm 2</label>
<caption>
<p>NECL: Network Embedding on Compressed Graph.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g014.tif"/>
</fig>
<p>In the algorithm, after getting the weighted compressed graph <italic>S</italic> (line 1), we obtain the representation of super-nodes <inline-formula id="inf85">
<mml:math id="mml-math85-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as <inline-formula id="inf86">
<mml:math id="mml-math86-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3d5;</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the compressed graph with the provided network embedding algorithm (line 2). We apply any random walk based representation learning algorithm on the compressed graph. We just need to apply weighted random walks to consider the edge weights. As the size of the compressed graph is smaller than the original graph, it is more efficient to get embeddings of super-nodes than single vertices. Finally, we assign the embedding of super-nodes to vertices according to the mapping <italic>M</italic> obtained from the compression (line 3&#x2013;7). While we may lose some local information with assigning the same representation to multiple vertices, we gain efficiency. Also, we may not need to get small differences between nodes for many applications, e.g., classification, as we see in <xref ref-type="sec" rid="s4">Section 4</xref>.</p>
</sec>
<sec id="s3-2-2">
<label>3.2.2</label>
<title> Network Embedding With Refinement</title>
<p>Our main goal in this section is to <italic>improve the effectiveness</italic> of the embedding problem while still maintaining <italic>similar efficiency</italic> with the baseline methods. For this goal, we employ the embedding of the compressed graph as initialization to the original graph embedding and refine it.</p>
<p>When we compress a graph using the neighborhood similarity score, we can easily capture the global structure of the original graph. On a large original graph, the random walk may get stuck in a local neighborhood. As a result, the embedding method may not capture the global structure of the original graph. However, when we do the random walk on the compressed graph, it visits the globally similar neighbor nodes. Hence, we can capture the global proximity of the nodes. That is why, in this method, we first embed the compressed graph for a given similarity threshold to encode the original graph&#x2019;s global structure in the representation as in <xref ref-type="sec" rid="s3-2-1">Section 3.2.1</xref>. Then, for the embedding of the original graph, instead of starting with randomly initialized representations, which happens in the original embedding methods such as DeepWalk and Node2vec, we start with the representations obtained from the compressed graph. In the case of random representations, for example, two similar nodes are likely to have two very different and distant representations, hence the optimization process may not provide an accurate representation and this may decrease the quality or it may take a longer time to make them similar. However, initializing the representation using the compressed graph embedding provides global structure information as initial knowledge to the embedding. The original graph embedding updates this initial embedding with local information that may be lost with compression. Therefore, final embeddings have better quality by integrating local and global information in one representation. In <xref ref-type="fig" rid="F14">Algorithm 2</xref>, the original graph embedding is obtained in line eight by refining the compressed graph embedding given as the initial representation.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title> Experiments</title>
<p>We do our experimental studies to compare our methods with different models in terms of efficiency and effectiveness. We evaluate the quality of embeddings through challenging multi-class and multi-label classification tasks on four popular real-world graph datasets. First, in <xref ref-type="sec" rid="s4-1">Section 4.1</xref>, we present our model&#x2019;s performance based on different parameters. Then, we compare the results of our models with the results of HARP.</p>
<p>
<bold>Datasets:</bold> We consider four real-world graphs<sup>1</sup>, which have been widely adopted in the network embedding studies. Two of them are single-label, which are Wiki and Citeseer, and two of them are multi-label datasets, which are DBLP and BlogCatalog (BlogC). In single-label datasets, each node in the datasets has a single-label from multi-class values. In multi-label datasets, a node can belong to more than one class.</p>
<p>
<bold>Baseline methods:</bold> To demonstrate that our methods can work with different graph embedding methods, we use three popular graph embedding methods, namely DeepWalk, Node2vec and LINE, as the baseline methods in our model. We combine each baseline method with our methods and compare their performance. We give a brief explanation of the baseline methods in Section 2. We name our first method NECL, which uses a compressed graph embedding as the original graph embedding, and the second method NECL-RF, which uses the compressed graph embedding as the initial vector for the original graph embedding and refines it with the original graph.</p>
<p>
<bold>Parameter Settings:</bold> For DeepWalk, Node2vec, NECL(DW), NECL(N2V), NECL-RF(DW) and NECL-RF(N2V), we set the following parameters: the number of random walks &#x3b3;, walk length <italic>t</italic>, window size <italic>w</italic> for the Skip-gram model and representation size <italic>d</italic>. The parameter setting for all models is <inline-formula id="inf87">
<mml:math id="mml-math87-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>40</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf88">
<mml:math id="mml-math88-fdata.2020.608043">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf89">
<mml:math id="mml-math89-fdata.2020.608043">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>10</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf90">
<mml:math id="mml-math90-fdata.2020.608043">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. The initial learning rate and final learning rate are set to 0.025 and 0.001 respectively in all models. Representation size for LINE is <inline-formula id="inf91">
<mml:math id="mml-math91-fdata.2020.608043">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> for all models.</p>
<p>
<bold>Classification</bold> We present our results and compare them with the baseline methods and also HARP in single-label and multi-label classification tasks. For the single classification task, the multi-class SVM is employed as the classifier, which uses the one-vs-rest scheme. For the multi-label classification task, we train a one-vs-rest logistic regression model with <inline-formula id="inf92">
<mml:math id="mml-math92-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> regularization on the graph embeddings for prediction. The logistic regression model is implemented with LibLinear (<xref ref-type="bibr" rid="B15">Fan et al., 2008</xref>).</p>
<p>For the evaluation, after getting embeddings for nodes in the graph, we use these embeddings as the features of the nodes. Then, we train a classifier using these features. To train the classifier, we randomly sample a certain portion of labeled vertices from the graph and use the rest of the vertices as the test data. To have a detailed comparison of methods, we vary our training ratio from <inline-formula id="inf93">
<mml:math id="mml-math93-fdata.2020.608043">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mtext>%</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf94">
<mml:math id="mml-math94-fdata.2020.608043">
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mtext>%</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> on the Citeseer, Wiki, and DBLP datasets and from <inline-formula id="inf95">
<mml:math id="mml-math95-fdata.2020.608043">
<mml:mrow>
<mml:mn>10</mml:mn>
<mml:mtext>%</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> to <inline-formula id="inf96">
<mml:math id="mml-math96-fdata.2020.608043">
<mml:mrow>
<mml:mn>80</mml:mn>
<mml:mtext>%</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> for BlogCatalog. We use a larger portion of training data for the BlogCatalog dataset because the number of class labels of BlogCatalog is about ten times that of the other graphs.</p>
<p>We repeat the classification tasks ten times to ensure the reliability of our experiment and report the average macro <inline-formula id="inf97">
<mml:math id="mml-math97-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf98">
<mml:math id="mml-math98-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores and embedding times of our models with different parameters. Since our focus is improving the efficiency of embeddings, we report the time for embedding and do not include compression time. However, as we explain in the methodology section, we just need to compute the similarity between vertices and their neighbors&#x2019; neighbors and combine them into super-nodes. Furthermore, the computation is not multi-level, but a one-time computation. Therefore, the compression part does not have high complexity and it does not have an impact on efficiency. All experiments are performed on a server running Ubuntu 14.04 with four Intel 2.6&#xa0;GHz ten-core CPUs and 48&#xa0;GB of memory. All data and code are publicly available through this link: <ext-link ext-link-type="uri" xlink:href="https://github.com/esraabil/NECL">https://github.com/esraabil/NECL</ext-link>.</p>
<sec id="s4-1">
<label>4.1</label>
<title> Analysis of NECL</title>
<p>We present our results in <xref ref-type="table" rid="T1 T2">Tables 1, 2</xref>. For the similarity threshold <inline-formula id="inf99">
<mml:math id="mml-math99-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, the compressed graph becomes very small and gives low macro <inline-formula id="inf100">
<mml:math id="mml-math100-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf101">
<mml:math id="mml-math101-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores. Since it also merges more nodes into super-nodes with a low similarity value, this may result in information loss on the graph. Hence, we set the cutting point of compression at <inline-formula id="inf102">
<mml:math id="mml-math102-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. Moreover, to see the effect of the similarity threshold value &#x3bb; on the compression and accuracy, we vary it from 0.45 to 1. We present the macro <inline-formula id="inf103">
<mml:math id="mml-math103-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf104">
<mml:math id="mml-math104-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores with respect to the fraction of labeled data in <xref ref-type="fig" rid="F3">Figures 3</xref>&#x2013;<xref ref-type="fig" rid="F6">6</xref> and embedding times in <xref ref-type="fig" rid="F7">Figure 7</xref>. We also report the number of edges and vertices in the compressed graph with respect to similarity threshold &#x3bb; on <xref ref-type="fig" rid="F8">Figure 8</xref> to see the effectiveness of the graph compression algorithm.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Performance comparisons of NECL with baseline methods (BL).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th align="left"/>
<th colspan="3" align="center">Macro F<sub>1</sub>
</th>
<th colspan="3" align="center">Micro F<sub>1</sub>
</th>
<th colspan="3" align="center">Time (s)</th>
</tr>
<tr>
<th colspan="2" align="left"/>
<th align="center">NECL</th>
<th align="center">BL</th>
<th align="center">Gain%</th>
<th align="center">NECL</th>
<th align="center">BL</th>
<th align="center">Gain%</th>
<th align="center">NECL</th>
<th align="center">BL</th>
<th align="center">Gain%</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left"/>
<td align="center">DW</td>
<td align="center">0.434</td>
<td align="center">0.408</td>
<td align="center">6.4</td>
<td align="center">0.469</td>
<td align="center">0.440</td>
<td align="center">6.6</td>
<td align="center">9.26</td>
<td align="center">16.21</td>
<td align="center">42.9</td>
</tr>
<tr>
<td align="left"/>
<td align="center">N2V</td>
<td align="center">0.439</td>
<td align="center">0.437</td>
<td align="center">0.5</td>
<td align="center">0.475</td>
<td align="center">0.472</td>
<td align="center">0.6</td>
<td align="center">8.95</td>
<td align="center">15.46</td>
<td align="center">42.1</td>
</tr>
<tr>
<td align="left">Citeseer</td>
<td align="center">Line</td>
<td align="center">0.317</td>
<td align="center">0.320</td>
<td align="center">-0.9</td>
<td align="center">0.355</td>
<td align="center">0.359</td>
<td align="center">-1.1</td>
<td align="center">0.67</td>
<td align="center">1.43</td>
<td align="center">53.1</td>
</tr>
<tr>
<td align="left"/>
<td align="center">DW</td>
<td align="center">0.390</td>
<td align="center">0.373</td>
<td align="center">4.6</td>
<td align="center">0.497</td>
<td align="center">0.483</td>
<td align="center">2.9</td>
<td align="center">4.84</td>
<td align="center">8.98</td>
<td align="center">46.0</td>
</tr>
<tr>
<td align="left"/>
<td align="center">N2V</td>
<td align="center">0.349</td>
<td align="center">0.348</td>
<td align="center">1.0</td>
<td align="center">0.489</td>
<td align="center">0.490</td>
<td align="center">-0.2</td>
<td align="center">9.41</td>
<td align="center">19.10</td>
<td align="center">50.7</td>
</tr>
<tr>
<td align="left">Wiki</td>
<td align="center">Line</td>
<td align="center">0.355</td>
<td align="center">0.369</td>
<td align="center">-3.8</td>
<td align="center">0.517</td>
<td align="center">0.518</td>
<td align="center">0.2</td>
<td align="center">1.28</td>
<td align="center">3.81</td>
<td align="center">66.4</td>
</tr>
<tr>
<td align="left"/>
<td align="center">DW</td>
<td align="center">0.625</td>
<td align="center">0.603</td>
<td align="center">3.6</td>
<td align="center">0.656</td>
<td align="center">0.635</td>
<td align="center">3.3</td>
<td align="center">39.97</td>
<td align="center">93.96</td>
<td align="center">57.5</td>
</tr>
<tr>
<td align="left"/>
<td align="center">N2V</td>
<td align="center">0.626</td>
<td align="center">0.624</td>
<td align="center">0.3</td>
<td align="center">0.657</td>
<td align="center">0.653</td>
<td align="center">0.6</td>
<td align="center">75.81</td>
<td align="center">175.31</td>
<td align="center">56.8</td>
</tr>
<tr>
<td align="left">DBLP</td>
<td align="center">Line</td>
<td align="center">0.595</td>
<td align="center">0.593</td>
<td align="center">0.3</td>
<td align="center">0.649</td>
<td align="center">0.645</td>
<td align="center">0.6</td>
<td align="center">9.94</td>
<td align="center">28.58</td>
<td align="center">65.2</td>
</tr>
<tr>
<td align="left"/>
<td align="center">DW</td>
<td align="center">0.246</td>
<td align="center">0.245</td>
<td align="center">0.4</td>
<td align="center">0.388</td>
<td align="center">0.387</td>
<td align="center">0.2</td>
<td align="center">71.7</td>
<td align="center">99.3</td>
<td align="center">27.7</td>
</tr>
<tr>
<td align="left"/>
<td align="center">N2V</td>
<td align="center">0.252</td>
<td align="center">0.251</td>
<td align="center">0.3</td>
<td align="center">0.391</td>
<td align="center">0.389</td>
<td align="center">-0.5</td>
<td align="center">1,247</td>
<td align="center">1,628</td>
<td align="center">23.4</td>
</tr>
<tr>
<td align="left">BlogC</td>
<td align="center">Line</td>
<td align="center">0.215</td>
<td align="center">0.219</td>
<td align="center">-1.8</td>
<td align="center">0.369</td>
<td align="center">0.373</td>
<td align="center">-1.1</td>
<td align="center">99.35</td>
<td align="center">126.65</td>
<td align="center">21.6</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Compression ratio with the similarity threshold <inline-formula id="inf105">
<mml:math id="mml-math105-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th colspan="3" align="center">
<inline-formula id="inf106">
<mml:math id="mml-math106-fdata.2020.608043">
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th colspan="3" align="center">
<inline-formula id="inf107">
<mml:math id="mml-math107-fdata.2020.608043">
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
</tr>
<tr>
<td align="left"/>
<td align="center">Compressed</td>
<td align="center">Original</td>
<td align="center">Ratio %</td>
<td align="center">Compressed</td>
<td align="center">Original</td>
<td align="center">Ratio %</td>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Citeseer</td>
<td align="center">1,427</td>
<td align="center">2,708</td>
<td align="center">47.3</td>
<td align="center">5,236</td>
<td align="center">10,858</td>
<td align="center">51.8</td>
</tr>
<tr>
<td align="left">Wiki</td>
<td align="center">1,060</td>
<td align="center">2,405</td>
<td align="center">55.9</td>
<td align="center">8,584</td>
<td align="center">23,192</td>
<td align="center">63</td>
</tr>
<tr>
<td align="left">DBLP</td>
<td align="center">8,824</td>
<td align="center">27,199</td>
<td align="center">69.9</td>
<td align="center">32,984</td>
<td align="center">133664</td>
<td align="center">75.3</td>
</tr>
<tr>
<td align="left">BlogC</td>
<td align="center">8,507</td>
<td align="center">10,312</td>
<td align="center">17.5</td>
<td align="center">543872</td>
<td align="center">667966</td>
<td align="center">18.6</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Detailed classification results on Citeseer.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g003.tif"/>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Detailed classification results on Wiki.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g004.tif"/>
</fig>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Detailed classification results on DBLP.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g005.tif"/>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Detailed classification results on BlogCatalog.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g006.tif"/>
</fig>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Run time analyses for different similarity threshold values &#x3bb; (Citeseer <bold>(A)</bold>, Wiki <bold>(B)</bold>, DBLP <bold>(C)</bold> and BlogCatalog <bold>(D)</bold>).</p>
</caption>
<graphic xlink:href="fdata-03-608043-g007.tif"/>
</fig>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>The ratio of vertices/edges of the compressed graphs to that of the original graphs. (Citeseer <bold>(A)</bold>, Wiki <bold>(B)</bold>, DBLP <bold>(C)</bold> and BlogCatalog <bold>(D)</bold>).</p>
</caption>
<graphic xlink:href="fdata-03-608043-g008.tif"/>
</fig>
<p>
<bold>Gain on baseline methods:</bold> For all datasets, we present macro <inline-formula id="inf108">
<mml:math id="mml-math108-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf109">
<mml:math id="mml-math109-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores for single and multi-label classification tasks and embedding time in <xref ref-type="table" rid="T1">Table 1</xref> and compression ratio for edge and vertices in <xref ref-type="table" rid="T2">Table 2</xref>. We use 5% training ratio of labeled vertices for Citeseer, Wiki, and DBLP and 40% training ratio for BlogCatalog. As we see from <xref ref-type="table" rid="T1">Table 1</xref>, for DeepWalk, there is a significant gain on macro and micro <inline-formula id="inf110">
<mml:math id="mml-math110-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in addition to gain on efficiency on Citeseer, Wiki, and DBLP. For Node2vec and LINE, while there is a significant gain on total embedding time as efficiency, there is no (significant) difference between NECL and baseline methods on macro <inline-formula id="inf111">
<mml:math id="mml-math111-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf112">
<mml:math id="mml-math112-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. For LINE, we have a higher gain on time for all datasets.</p>
<p>For DBLP, gains of embedding time are much higher than other datasets. On the other hand, for BlogCatalog, gains of embedding times are lower with respect to other datasets. As we see from the <xref ref-type="table" rid="T1 T2">Tables 1, 2</xref>, the gain of embedding time depends on the compression ratio of the number of edges and vertices. With compression, the number of vertices and edges for DBLP decrease from 27,199 to 8,824 (70%) and from 133,664 to 32,984 (75%), respectively. Therefore, embedding becomes more efficient with better or same accuracy. For BlogCatalog, the compression ratio is lower than the others, around 18%; therefore, the time gain is also lower. The reason for this is that, in DBLP, vertices have many common neighbors, so the neighborhood similarity is higher and this results in more compression. On the other hand, in BlogCatalog, vertices have fewer common neighbors and so a lower similarity, and this results in less compression. We can conclude that while the gain in the effectiveness of our method depends on the baseline method, the gain in efficiency of our method depends on the characteristics of the dataset.</p>
<p>
<bold>Detailed Analyses:</bold> We compare the performance of NECL framework for different similarity threshold values &#x3bb; that results in different compression ratios with the performance of the baseline methods. Macro <inline-formula id="inf113">
<mml:math id="mml-math113-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf114">
<mml:math id="mml-math114-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores on different datasets are given in <xref ref-type="fig" rid="F3">Figures 3</xref>&#x2013;<xref ref-type="fig" rid="F6">6</xref> for Citeseer, Wiki, DBLP and BlogCatalog datasets, respectively. We observe that for <inline-formula id="inf115">
<mml:math id="mml-math115-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.45</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, macro <inline-formula id="inf116">
<mml:math id="mml-math116-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and micro <inline-formula id="inf117">
<mml:math id="mml-math117-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores for NECL are similar with or higher than baseline methods across all datasets except Citeseer. For <inline-formula id="inf118">
<mml:math id="mml-math118-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.45</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, the quality of embedding decreases dramatically and so does the accuracy of classification. The results for Citeseer depend on the baseline methods. While <inline-formula id="inf119">
<mml:math id="mml-math119-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.45</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> gives better accuracy for DeepWalk and Node2vec, it gives worse for LINE.</p>
<p>In addition to the macro <inline-formula id="inf120">
<mml:math id="mml-math120-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf121">
<mml:math id="mml-math121-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores, we also report the embedding time and compression ratio for different similarity threshold values &#x3bb; in <xref ref-type="fig" rid="F7">Figures 7</xref>, <xref ref-type="fig" rid="F8">8</xref>. From the figures, we see that NECL takes significantly less time compared to the baseline method. As expected, for a lower threshold value &#x3bb;, the compression ratio increases, and we get a smaller compressed graph and so the embedding time decreases. As BlogCatalog has a lower compression ratio, the embedding time is less for all three baseline methods. We observe that there is a linear relation between &#x3bb; and the number of vertices and edges until <inline-formula id="inf122">
<mml:math id="mml-math122-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. After this point, graph sizes change dramatically for smaller &#x3bb; for Citeseer, Wiki, and DBLP, but the decrease is slow for BlogCatalog until <inline-formula id="inf123">
<mml:math id="mml-math123-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.7</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. One of the reasons for this situation in BlogCatalog is that the sizes of the neighbor sets for some vertices are very large, and it is not easy to get higher similarity for a larger set. For example, for two vertices with 15 edges, 10 common neighbors can be considered to have a higher similarity. On the other hand, two vertices with 150 edges should have 100 common neighbors to get the same similarity value, which is not very common.</p>
<p>From these detailed analyses, we observe that smaller &#x3bb; results in smaller compressed graph. As a result, embedding becomes more efficient. However, for <inline-formula id="inf124">
<mml:math id="mml-math124-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.45</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, we start to lose critical information about the graph, hence, while efficiency increases, effectiveness decreases dramatically. As a solution to this problem, we refine our results with our second method, NECL-RF.</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title> Comparisons of all Methods</title>
<p>In this section, we evaluate the effectiveness of our NECL-RF method and compare the results with NECL, HARP, and baseline methods. From the analysis of NECL, we can see that <inline-formula id="inf125">
<mml:math id="mml-math125-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> similarity threshold value gives the best result in terms of efficiency and effectiveness. For this reason, we decide to use the compressed graph for <inline-formula id="inf126">
<mml:math id="mml-math126-fdata.2020.608043">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> threshold value and get the embedding for the compressed graph. We present the macro <inline-formula id="inf127">
<mml:math id="mml-math127-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf128">
<mml:math id="mml-math128-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores achieved on all datasets in <xref ref-type="table" rid="T1 T2">Tables 1, 2</xref>. We use <inline-formula id="inf129">
<mml:math id="mml-math129-fdata.2020.608043">
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mtext>%</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> of the labeled vertices for Citeseer, Wiki, and DBLP, <inline-formula id="inf130">
<mml:math id="mml-math130-fdata.2020.608043">
<mml:mrow>
<mml:mn>40</mml:mn>
<mml:mtext>%</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> for BlogCatalog as training data. To have a detailed comparison between our models, NECL and NECL-RF, HARP and the baseline methods, we vary the fraction of labeled data for classification, and present macro <inline-formula id="inf131">
<mml:math id="mml-math131-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf132">
<mml:math id="mml-math132-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores in <xref ref-type="fig" rid="F9">Figures 9</xref>&#x2013;<xref ref-type="fig" rid="F12">12</xref>.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Detailed comparisons of classification results on Citeseer.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g009.tif"/>
</fig>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Detailed comparisons of classification results on Wiki.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g010.tif"/>
</fig>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Detailed comparisons of classification results on DBLP.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g011.tif"/>
</fig>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Detailed comparisons of classification results on BlogCatalog.</p>
</caption>
<graphic xlink:href="fdata-03-608043-g012.tif"/>
</fig>
<p>In <xref ref-type="table" rid="T3">Table 3</xref>, we see that NECL or NECL-RF gives the highest macro <inline-formula id="inf133">
<mml:math id="mml-math133-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and micro <inline-formula id="inf134">
<mml:math id="mml-math134-fdata.2020.608043">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> scores for datasets with all baseline methods except for LINE on Wiki. For DBLP, NECL or NECL-RF gives the highest accuracy for all the three baseline models. NECL-RF significantly improves the quality of the embedding for all datasets except Citeseer with Node2vec and Wiki with LINE.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Performance comparisons of all methods.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left"/>
<th colspan="2" align="center">Citeseer</th>
<th colspan="2" align="center">Wiki</th>
<th colspan="2" align="center">DBLP</th>
<th colspan="2" align="center">BlogCatalog</th>
</tr>
<tr>
<td align="left"/>
<td align="center">Macro F<sub>1</sub>
</td>
<td align="center">Micro F<sub>1</sub>
</td>
<td align="center">Macro F<sub>1</sub>
</td>
<td align="center">Micro F<sub>1</sub>
</td>
<td align="center">Macro F<sub>1</sub>
</td>
<td align="center">Micro F<sub>1</sub>
</td>
<td align="center">Macro F<sub>1</sub>
</td>
<td align="center">Micro F<sub>1</sub>
</td>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Baseline (DW)</td>
<td align="center">0.408</td>
<td align="center">0.440</td>
<td align="center">0.373</td>
<td align="center">0.483</td>
<td align="center">0.603</td>
<td align="center">0.635</td>
<td align="center">0.245</td>
<td align="center">0.387</td>
</tr>
<tr>
<td align="left">HARP (DW)</td>
<td align="center">0.422</td>
<td align="center">0.453</td>
<td align="center">0.366</td>
<td align="center">0.483</td>
<td align="center">0.612</td>
<td align="center">0.644</td>
<td align="center">0.253</td>
<td align="center">0.390</td>
</tr>
<tr>
<td align="left">NECL (DW)</td>
<td align="center">
<bold>0.434</bold>
</td>
<td align="center">
<bold>0.469</bold>
</td>
<td align="center">0.390</td>
<td align="center">0.497</td>
<td align="center">
<bold>0.625</bold>
</td>
<td align="center">
<bold>0.656</bold>
</td>
<td align="center">0.246</td>
<td align="center">0.388</td>
</tr>
<tr>
<td align="left">NECL-RF (DW)</td>
<td align="center">0.422</td>
<td align="center">0.457</td>
<td align="center">
<bold>0.408</bold>
</td>
<td align="center">
<bold>0.549</bold>
</td>
<td align="center">0.617</td>
<td align="center">0.649</td>
<td align="center">
<bold>0.285</bold>
</td>
<td align="center">
<bold>0.405</bold>
</td>
</tr>
<tr>
<td align="left">Baseline (N2V)</td>
<td align="center">0.437</td>
<td align="center">0.472</td>
<td align="center">0.348</td>
<td align="center">0.490</td>
<td align="center">0.624</td>
<td align="center">0.653</td>
<td align="center">0.251</td>
<td align="center">0.389</td>
</tr>
<tr>
<td align="left">HARP (N2V)</td>
<td align="center">0.432</td>
<td align="center">0.466</td>
<td align="center">0.352</td>
<td align="center">0.492</td>
<td align="center">0.626</td>
<td align="center">0.656</td>
<td align="center">0.259</td>
<td align="center">0.394</td>
</tr>
<tr>
<td align="left">NECL (N2V)</td>
<td align="center">
<bold>0.439</bold>
</td>
<td align="center">
<bold>0.475</bold>
</td>
<td align="center">0.349</td>
<td align="center">0.489</td>
<td align="center">0.626</td>
<td align="center">0.657</td>
<td align="center">0.252</td>
<td align="center">0.391</td>
</tr>
<tr>
<td align="left">NECL-RF (N2V)</td>
<td align="center">0.430</td>
<td align="center">0.464</td>
<td align="center">
<bold>0.372</bold>
</td>
<td align="center">
<bold>0.513</bold>
</td>
<td align="center">
<bold>0.628</bold>
</td>
<td align="center">
<bold>0.661</bold>
</td>
<td align="center">
<bold>0.260</bold>
</td>
<td align="center">
<bold>0.398</bold>
</td>
</tr>
<tr>
<td align="left">Baseline (LINE)</td>
<td align="center">0.320</td>
<td align="center">0.359</td>
<td align="center">
<bold>0.369</bold>
</td>
<td align="center">
<bold>0.518</bold>
</td>
<td align="center">0.593</td>
<td align="center">0.645</td>
<td align="center">0.219</td>
<td align="center">0.373</td>
</tr>
<tr>
<td align="left">HARP (LINE)</td>
<td align="center">0.430</td>
<td align="center">0.494</td>
<td align="center">0.322</td>
<td align="center">0.396</td>
<td align="center">0.594</td>
<td align="center">0.643</td>
<td align="center">0.228</td>
<td align="center">0.373</td>
</tr>
<tr>
<td align="left">NECL (LINE)</td>
<td align="center">0.317</td>
<td align="center">0.355</td>
<td align="center">0.355</td>
<td align="center">0.517</td>
<td align="center">0.595</td>
<td align="center">0.649</td>
<td align="center">0.215</td>
<td align="center">0.369</td>
</tr>
<tr>
<td align="left">NECL-RF (LINE)</td>
<td align="center">
<bold>0.444</bold>
</td>
<td align="center">
<bold>0.513</bold>
</td>
<td align="center">0.353</td>
<td align="center">0.493</td>
<td align="center">
<bold>0.619</bold>
</td>
<td align="center">
<bold>0.661</bold>
</td>
<td align="center">
<bold>0.252</bold>
</td>
<td align="center">
<bold>0.377</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>While HARP has higher accuracy than baseline methods, it does multiple levels of iteration of graph coarsening and representation learning, so it increases the time complexity. On the other hand, we do only one level of iteration in NECL-RF. Embedding time for NECL-RF is the total of embedding time for the original graph and compressed graph. As we see in the previous section, the compressed graph is much smaller than the original graph, so the learning time for the compressed graph is significantly less compared to the baseline method. Hence, complexity does not increase significantly as in HARP. As a result, we get similar or better effectiveness than HARP with less time complexity.</p>
<p>Detailed comparisons between all methods using different portions of labeled vertices as training data are presented in <xref ref-type="fig" rid="F9">Figures 9</xref>&#x2013;<xref ref-type="fig" rid="F12">12</xref>. We see that, in most of the cases, NECL and NECL-RF give the highest accuracy compared to other models or give better results than the baseline models. We observe that, for some datasets, refinement decreases the accuracy of NECL. The reason for this decrease might be that, for some classification tasks, learning a global structure with compressed data, which also includes a local structure in the super-nodes, would be enough. So when we relearn and update the embedding of the compressed graph, it might add noise to the features. As a result, it deteriorates the accuracy of the classification task. Also, as we see from the figures, our method has a better improvement on DeepWalk. The reason is that while Node2vec and LINE may learn higher-order proximity, regular random walk in DeepWalk may not capture higher-order proximity, so it loses the global information. It also depends on the datasets.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title> Conclusion</title>
<p>We present a novel method for network embedding that preserves the local and global structure of the network. To capture the global structure and accelerate the efficiency of state-of-the-art methods, we introduce a neighborhood similarity-based graph compression method. We combine the vertices with common neighbors into a super-node. Then we apply network representation learning on the compressed graph so that we can reduce the run time and also capture the global structure. As a first method, we project the embedding of super-nodes to original nodes without refinement. In the second part, we relearn the representation of the network by assigning the super-node embeddings to their original vertices as initial features and update these using the baseline method. In this way, we combine the local structure with the global structure of the network. While the first method provides efficiency by learning on the small compressed graph, the second method provides effectiveness by incorporating global information into the embedding with the compressed graph. NECL and NECL-RF are general meta-strategies that can be used to improve the efficiency and effectiveness of many state-of-the-art graph embedding methods. We use three popular state-of-the-art network embedding methods DeepWalk, Node2vec, and LINE as baselines. Experimental results on various real-world graphs show the effectiveness and efficiency of our methods on challenging multi-label and multi-class classification tasks for all these three baseline methods.</p>
<p>The future work of our NECL and NECL-RF could be using different refinement methods of graph embedding. We can apply different neural network models without relearning the whole network to refine the embedding which we get from the compressed graph. Another extension could be done by using different clustering methods or similarity measurements to compress the graph and use other baseline methods.</p>
</sec>
<sec id="s7">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/Supplementary Material, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="s8">
<title>Author Contributions</title>
<p>Conceived and designed the experiments: MI and EA. Performed the experiments: MI, GJ, and EA. Analyzed the data: MI, GJ, EA, FT, and MA. Wrote the paper: MI, FT, GJ, EA, and MA.</p>
</sec>
<sec id="s9">
<title>Funding</title>
<p>This work was partially done by the author Ginger Johnson while she attended the Big Data Analytics REU program at Oklahoma State University supported by the National Science Foundation under Grant No. 1659645.</p>
</sec>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
</body>
<back>
<ack>
<p>The content of this manuscript has been presented in part at the Big Data conference (<xref ref-type="bibr" rid="B3">Akbas and Aktas, 2019a</xref>). Earlier version of this manuscript has been released as a pre-print at Arxiv (<xref ref-type="bibr" rid="B4">Akbas and Aktas, 2019b</xref>).</p>
</ack>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Adler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mitzenmacher</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>Towards compressing web graphs</article-title>,&#x201d; in <conf-name>Proceedings of data compression conference Snowbird, UT, USA, March 27-29, 2001</conf-name> (<publisher-loc>DCC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>203</fpage>&#x2013;<lpage>212</lpage>. </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akbas</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Attributed graph clustering: an attribute-aware graph embedding approach</article-title>,&#x201d; in <conf-name>Proceedings of the 2017 IEEE/ACM international conference on advances in social networks analysis and mining, Sydney, Australia, July, 2017</conf-name>, <fpage>305</fpage>&#x2013;<lpage>308</lpage>. </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akbas</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Aktas</surname>
<given-names>M. E.</given-names>
</name>
</person-group> (<year>2019a</year>). &#x201c;<article-title>Network embedding: on compression and learning</article-title>,&#x201d; in <conf-name>IEEE international conference on Big data (Big data), Los Angeles, CA, USA, December 9-12, 2019</conf-name>, <fpage>4763</fpage>&#x2013;<lpage>4772</lpage>. </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akbas</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Aktas</surname>
<given-names>M. E.</given-names>
</name>
</person-group> (<year>2019b</year>). <article-title>Network embedding: on compression and learning</article-title>. <comment>arXiv preprint arXiv:1907.02811</comment> </citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Akbas</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Graph clustering based on attribute-aware graph embedding</article-title>,&#x201d; in <source>From security to community detection in social networking platforms</source> (<publisher-name>Springer International Publishing</publisher-name>), <fpage>109</fpage>&#x2013;<lpage>131</lpage>. </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ayan Kumar Bhowmick</surname>
<given-names>M. D.</given-names>
</name>
<name>
<surname>Meneni</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Louvainne: hierarchical louvain method for high quality and scalable network embedding</article-title>,&#x201d; in <conf-name>WSDM, Houston, Texas, USA, February 4-6, 2020</conf-name>. </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Belkin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Niyogi</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>Laplacian eigenmaps and spectral techniques for embedding and clustering</article-title>,&#x201d; in <conf-name>Proceedings of the 14th international conference on neural information processing systems: natural and synthetic, NIPS&#x2019;01, Vancouver, British Columbia, Canada, Dec 3&#x2010;8, 2001</conf-name>, <fpage>585</fpage>&#x2013;<lpage>591</lpage>. </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cai</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>V. W.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>K. C.-C.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A comprehensive survey of graph embedding: problems, techniques, and applications</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source> <volume>30</volume>, <fpage>1616</fpage>&#x2013;<lpage>1637</lpage>. <pub-id pub-id-type="doi">10.1109/tkde.2018.2807452</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Grarep: learning graph representations with global structural information</article-title>,&#x201d; in <conf-name>Proceedings of the CIKM&#x2019;15, Melbourne Australia, October, 2015</conf-name>, <fpage>891</fpage>&#x2013;<lpage>900</lpage>. </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep neural networks for learning graph representations</article-title>,&#x201d; in <conf-name>Thirtieth AAAI conference on artificial intelligence, Phoenix, Arizona, USA, February 12-17, 2016</conf-name>. </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Perozzi</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Al-Rfou</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Skiena</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018a</year>). <article-title>A tutorial on network embeddings</article-title>. <comment>arXiv preprint arXiv:1808.02590</comment> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Perozzi</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Skiena</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018b</year>). &#x201c;<article-title>Harp: hierarchical representation learning for networks</article-title>,&#x201d; in <conf-name>Thirty-second AAAI Conference on artificial intelligence, New Orleans, Louisiana, USA, February 2-7, 2018</conf-name>. </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Graphzoom: a multi-level spectral approach for accurate and scalable graph embedding</article-title>,&#x201d; in <conf-name>ICLR 2020, Addis Ababa, Ethiopia, April 26-May 1, 2020</conf-name>. </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Pei</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>A survey on network embedding</article-title>,&#x201d; in <conf-name>IEEE transactions on knowledge and data engineering</conf-name>. </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname>
<given-names>R.-E.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>K.-W.</given-names>
</name>
<name>
<surname>Hsieh</surname>
<given-names>C.-J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.-R.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>C.-J.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Liblinear: a library for large linear classification</article-title>. <source>J. Mach. Learn. Res</source>. <volume>9</volume>, <fpage>1871</fpage>&#x2013;<lpage>1874</lpage>. <pub-id pub-id-type="doi">10.1145/1390681.1442794</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gilbert</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Levchenko</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Compressing network graphs</article-title>,&#x201d; in <conf-name>Proceedings of the LinkKDD workshop at the KDD&#x2019;04, Seattle, WA, USA, August, 2004</conf-name>. </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goyal</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ferrara</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Graph embedding techniques, applications, and performance: a survey</article-title>. <source>Knowl. Base Syst</source>. <volume>151</volume>, <fpage>78</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2018.03.022</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Grover</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Leskovec</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>node2vec: scalable feature learning for networks</article-title>,&#x201d; in <conf-name>KDD proceedings of the 22nd ACM SIGKDD</conf-name> (<publisher-loc>San Francisco, CA, USA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>855</fpage>&#x2013;<lpage>864</lpage>. </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hamilton</surname>
<given-names>W. L.</given-names>
</name>
<name>
<surname>Ying</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Leskovec</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Representation learning on graphs: methods and applications</article-title>. <comment>arXiv preprint arXiv:1709.05584</comment> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gurukar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Parthasarathy</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Mile: a multi-level framework for scalable graph embedding</article-title>. <comment>arXiv preprint arXiv:1802.09612</comment> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mikolov</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Corrado</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Dean</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2013a</year>). &#x201c;<article-title>Efficient estimation of word representations in vector space</article-title>,&#x201d; in <conf-name>Proceedings of workshop at ICLR, Scottsdale, Arizona, May 2-4</conf-name>. </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mikolov</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Sutskever</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Corrado</surname>
<given-names>G. S.</given-names>
</name>
<name>
<surname>Dean</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2013b</year>). &#x201c;<article-title>Distributed representations of words and phrases and their compositionality</article-title>,&#x201d; in <conf-name>Advances in neural information processing systems, CA, USA, December 5-10, 2013</conf-name>, <fpage>3111</fpage>&#x2013;<lpage>3119</lpage>. </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niepert</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ahmed</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kutzkov</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Learning convolutional neural networks for graphs</article-title>,&#x201d; in <conf-name>Proceedings of the ICML&#x2019;16, New York, USA, June 19-24, 2016</conf-name>, <fpage>2014</fpage>&#x2013;<lpage>2023</lpage>. </citation>
</ref>
<ref id="B24">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ou</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pei</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Asymmetric transitivity preserving graph embedding</article-title>,&#x201d; in <conf-name>Proceedings of the KDD&#x2019;16</conf-name> (<publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>1105</fpage>&#x2013;<lpage>1114</lpage>. </citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Perozzi</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Al-Rfou</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Skiena</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Deepwalk: online learning of social representations</article-title>,&#x201d; in <conf-name>Proceedings of the SIGKDD&#x2019;14</conf-name> (<publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>701</fpage>&#x2013;<lpage>710</lpage>. </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qiu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Network embedding as matrix factorization: unifying deepwalk, line, pte, and node2vec</article-title>,&#x201d; in <conf-name>Proceedings of the WSDM&#x2019;18, Los Angeles, California, USA, Feb 5-9, 2018</conf-name>, <fpage>459</fpage>&#x2013;<lpage>467</lpage>. </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Roweis</surname>
<given-names>S. T.</given-names>
</name>
<name>
<surname>Saul</surname>
<given-names>L. K.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>Nonlinear dimensionality reduction by locally linear embedding</article-title>. <source>Science</source> <volume>290</volume>, <fpage>2323</fpage>&#x2013;<lpage>2326</lpage>. <pub-id pub-id-type="doi">10.1126/science.290.5500.2323</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Suel</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>Compressing the graph structure of the web</article-title>,&#x201d; in <conf-name>Proceedings of the data compression conference, Snowbird, UT, USA, March 27-29, 2001</conf-name>, <fpage>213</fpage>&#x2013;<lpage>222</lpage>. </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bandyopadhyay</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Bashizade</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sadayappan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Parthasarathy</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Atp: directed graph embedding with asymmetric transitivity preservation</article-title>. <source>Proc. AAAI Conf. Artif. Intell.</source> <volume>33</volume>, <fpage>265</fpage>&#x2013;<lpage>272</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v33i01.3301265</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Qu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Mei</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Line: large-scale information network embedding</article-title>,&#x201d; in <conf-name>Proceedings of the WWW&#x2019;15, Florence, Italy, May 18-22, 2015</conf-name>, <fpage>1067</fpage>&#x2013;<lpage>1077</lpage>. </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tsitsulin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mottin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Karras</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Verse: versatile graph embeddings from similarity measures</article-title>,&#x201d; in <conf-name>Proceedings of the WWW&#x2019;18, Lyon, France, April 23, 2018</conf-name>, <fpage>539</fpage>&#x2013;<lpage>548</lpage>. </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>Graphgan: graph representation learning with generative adversarial nets</article-title>,&#x201d; in <conf-name>Thirty-Second AAAI Conference on Artificial Intelligence</conf-name>, <comment>New Orleans, Louisiana, February 2-7, 2018</comment>. </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wold</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Esbensen</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Geladi</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>1987</year>). <article-title>Principal component analysis</article-title>. <source>Chemometr. Intell. Lab. Syst.</source> <volume>2</volume>, <fpage>37</fpage>&#x2013;<lpage>52</lpage>. </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ying</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>You</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Morris</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hamilton</surname>
<given-names>W. L.</given-names>
</name>
<name>
<surname>Leskovec</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Hierarchical graph representation learning with differentiable pooling</article-title>,&#x201d; in <conf-name>Proceedings of the NIPS&#x2019;18, Montr&#x00e9;al, Canada, December 2-8, 2018</conf-name>, <fpage>4805</fpage>&#x2013;<lpage>4815</lpage>. </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Network representation learning: a survey</article-title>,&#x201d; in <conf-name>IEEE Transactions on Big Data</conf-name>, <fpage>1</fpage>. </citation>
</ref>
</ref-list>
</back>
</article>
