/*- * * Store on disk and search through a graph's shortest paths. * * Copyright (c) 2010, Diomidis Spinellis * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * To compile with GCC: * g++ -O -I /path_to_boost_include/ -L /path_to_boost_lib/ -l boost_thread -l boost_system -l boost_filesystem smap.cpp -o smap * * To compile with Microsoft Visual C/C++: * cl /Zi /EHsc /I path_to_boost smap.cpp -Fesmap.exe /link /LIBPATH:path_to_boost_lib * * $Id: smap.cpp,v 1.15 2010/09/05 08:15:58 dds Exp $ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace boost::interprocess; // Sufficient for the Wikipedia graph: 15M nodes const std::size_t FileSize = 8l * 1024 * 1024 * 1024; const std::size_t Elements = 20 * 1000 * 1000; class Node; //Typedefs of allocators and containers typedef managed_mapped_file::segment_manager SegmentManager; typedef allocator VoidAllocator; typedef allocator CharAllocator; typedef offset_ptr NodePtr; typedef allocator NodePtrAllocator; typedef slist Edges; typedef allocator EdgesAllocator; typedef basic_string, CharAllocator> CharString; // A graph node, suitable for performing a breadh-first search class Node { public: typedef enum {White, Gray, Black} Color; private: CharString name; // Node name Color color; // Color used during BFS NodePtr predecessor; // BFS predecessor Edges edges; // Node's edges public: //Since VoidAllocator is convertible to any other allocator, we can simplify //the initialization taking just one allocator for all inner containers. Node(const std::string &n, const VoidAllocator &voidAlloc) : name(n.begin(), n.end(), voidAlloc), color(White), predecessor(NULL), edges(voidAlloc) { } void addEdge(NodePtr p) { edges.push_front(p); } const CharString &getName() const { return name; } void setName(const std::string &n) { name = n.c_str(); } const Edges &getEdges() const { return edges; } void setColor(Color c) { color = c; } Color getColor() const { return color; } NodePtr getPredecessor() const { return predecessor; } void setPredecessor(NodePtr p) { predecessor = p; } friend class NodeCompare; bool operator !=(const Node &b) const { return name != b.name; } }; std::size_t hash_value(Node const& n) { boost::hash hasher; return hasher(n.getName()); } struct NodeEqual : std::binary_function { bool operator()(const Node &a, const Node &b) const { return a.getName() == b.getName(); } }; struct NodeCompare { bool operator() (const Node& lhs, const Node& rhs) const { return lhs.name < rhs.name; } }; //Definition of the set holding Node entries typedef allocator NodeAllocator; typedef boost::unordered_set, NodeEqual, NodeAllocator> Nodes; typedef Nodes::iterator NodesIter; /* * Read ^A-separated nodes from the inputFile, storing the graph structure * in the specified backingFile. */ static void readData(const char *backingFile, const char *inputFile) { std::ifstream in(inputFile, std::ios::binary); if (in.fail()) { perror(inputFile); exit(1); } boost::filesystem::remove_all(backingFile); managed_mapped_file segment(create_only, backingFile, FileSize); //An allocator convertible to any allocator type VoidAllocator allocInst (segment.get_segment_manager()); //Construct the memory map and fill it Nodes *entries = segment.construct("entries")(Elements, boost::hash(), NodeEqual(), allocInst); std::string line; // Reuse instead of constructing new Node n(std::string(), allocInst); // Progress indicator boost::progress_display showProgress(boost::filesystem::file_size(inputFile)); std::streampos pos = in.tellg(); // Loop through all lines, adding them to the graph while (std::getline(in, line)) { int split = line.find('\001'); if (split == std::string::npos) { std::cerr << "No separator: " << line << std::endl; continue; } n.setName(line.substr(0, split)); NodesIter from(entries->insert(n).first); n.setName(line.substr(split + 1)); NodesIter to(entries->insert(n).first); (const_cast(*from)).addEdge(const_cast(&*to)); // Update progress indicator showProgress += in.tellg() - pos; pos = in.tellg(); } } /* * Breadth first search from node from one node to another in a graph * of n nodes. * Return true if a path is found, false otherwise. */ static bool breadthFirstSearchFor(NodePtr from, NodePtr to, size_t n) { std::queue q; boost::progress_display progress(n); from->setColor(Node::Gray); q.push(from); while (!q.empty()) { NodePtr u = q.front(); q.pop(); const Edges edges = u->getEdges(); for (Edges::const_iterator j = edges.begin(); j != edges.end(); j++) if ((*j)->getColor() == Node::White) { (*j)->setColor(Node::Gray); (*j)->setPredecessor(u); ++progress; if (*j == to) return true; // Found q.push(*j); } u->setColor(Node::Black); } return false; // Not found } /* * Find a node with the specified name and return a pointer to it. * Exit the program with an error if no such node can be found. */ static Node *findNode(const Nodes *entries, const Node &n) { Nodes::const_iterator i = entries->find(n); if (i == entries->end()) { std::cerr << "Node [" << n.getName() << "] not found" << std::endl; exit(1); } return const_cast(&*i); } /* * Search and report the shortest graph path from "from" to "to" * The graph is stored in backingFile. */ static void searchData(const char *backingFile, const std::string &from, const std::string &to) { managed_mapped_file segment(open_copy_on_write, backingFile); //An allocator convertible to any allocator type VoidAllocator allocInst(segment.get_segment_manager()); // Obtain the previously saved entries Nodes *entries = segment.find("entries").first; NodePtr toPtr; bool found = breadthFirstSearchFor(findNode(entries, Node(from, allocInst)), toPtr = findNode(entries, Node(to, allocInst)), entries->size()); std::cout << std::endl; // End progress indicator if (!found) { std::cerr << "No path found" << std::endl; return; } // Report path in the right order std::list result; for (NodePtr p = toPtr; p != NULL; p = p->getPredecessor()) result.push_front(p->getName().c_str()); for (std::list ::const_iterator i = result.begin(); i != result.end(); i++) std::cout << *i << "\n"; } // Report program usage and exit static void usage() { std::cerr << "Usage: smap " << " -s backingFile word1 word2 | -r backingFile inputFile" << std::endl; exit(1); } int main(int argc, char *argv[]) { if (argc < 4 || *argv[1] != '-') usage(); switch (argv[1][1]) { case 'r': readData(argv[2], argv[3]); break; case 's': if (argc != 5) usage(); searchData(argv[2], argv[3], argv[4]); break; default: usage(); } } // vim: shiftwidth=2 smarttab