ModErn Text Analysis
META Enumerates Textual Applications
filesystem.h
Go to the documentation of this file.
1 
10 #ifndef META_FILESYSTEM_H_
11 #define META_FILESYSTEM_H_
12 
13 #include <string>
14 #include <fstream>
15 #include <vector>
16 #include <sys/stat.h>
17 #include "io/mmap_file.h"
18 #include "util/printing.h"
19 #include "util/progress.h"
20 #include "shim.h"
21 
22 namespace meta
23 {
24 namespace filesystem
25 {
26 
/**
 * Removes a file from the filesystem.
 *
 * The status returned by remove() is discarded, so the caller is not told
 * whether the file was actually removed.
 *
 * @param filename The path of the file to delete
 */
inline void delete_file(const std::string& filename)
{
    // best effort: the void return type leaves no way to report failure
    static_cast<void>(::remove(filename.c_str()));
}
35 
/**
 * Renames (moves) a file.
 *
 * The status returned by rename() is discarded, so the caller is not told
 * whether the rename took place.
 *
 * @param old_name The current path of the file
 * @param new_name The path the file should have afterwards
 */
inline void rename_file(const std::string& old_name,
                        const std::string& new_name)
{
    const char* from = old_name.c_str();
    const char* to = new_name.c_str();
    // best effort: the void return type leaves no way to report failure
    static_cast<void>(::rename(from, to));
}
46 
/**
 * Attempts to create a directory with permission bits 0755.
 *
 * NOTE(review): the result is true when mkdir() reports an error (-1) and
 * false when the directory was created. This looks inverted relative to
 * the function name; confirm what callers expect before changing it.
 *
 * @param dir_name The path of the directory to create
 * @return true if mkdir() returned -1, false otherwise
 */
inline bool make_directory(const std::string& dir_name)
{
    const int status = ::mkdir(dir_name.c_str(), 0755);
    return status == -1;
}
56 
/**
 * Checks for a file by trying to open it for reading.
 *
 * @param filename The path to probe
 * @return true if fopen() in "r" mode succeeded, false otherwise
 */
inline bool file_exists(const std::string& filename)
{
    FILE* handle = ::fopen(filename.c_str(), "r");
    if (handle == nullptr)
        return false;

    // the handle was only needed for the probe; release it right away
    ::fclose(handle);
    return true;
}
71 
77 inline uint64_t file_size(const std::string& filename)
78 {
79  if (!file_exists(filename))
80  return 0;
81 
82 #if META_IS_DARWIN
83  // Darwin is large file aware by default
84  struct stat st;
85  stat(filename.c_str(), &st);
86 #else
87  struct stat64 st;
88  stat64(filename.c_str(), &st);
89 #endif
90  return st.st_size;
91 }
92 
99 inline bool copy_file(const std::string& source, const std::string& dest)
100 {
101  if (!file_exists(source))
102  return false;
103 
104  // if file is larger than 128 MB, show copy progress
105  auto size = file_size(source);
106  uint64_t max_size = 1024UL * 1024UL * 1024UL * 128UL;
107  if (size > max_size)
108  {
109  printing::progress prog{"Copying file ", size};
110  std::ifstream source_file{source};
111  std::ofstream dest_file{dest};
112  uint64_t buf_size = 1024UL * 1024UL * 32UL; // 32 MB buffer
113  uint64_t total_processed = 0;
114  std::vector<char> buffer(buf_size);
115  while (source_file)
116  {
117  source_file.read(buffer.data(), buf_size);
118  auto processed = source_file.gcount();
119  total_processed += processed;
120  dest_file.write(buffer.data(), total_processed);
121  prog(processed);
122  }
123  prog.end();
124  }
125  // otherwise, copy the file normally
126  else
127  {
128  std::ifstream source_file{source, std::ios::binary};
129  std::ofstream dest_file{dest, std::ios::binary};
130  dest_file << source_file.rdbuf();
131  }
132 
133  return true;
134 }
135 
/**
 * Reads an entire file into a string.
 *
 * @param in_name The path of the file to read
 * @return everything that could be read from the file; the string is
 * empty when nothing could be read
 */
inline std::string file_text(const std::string& in_name)
{
    std::ostringstream contents;
    std::ifstream input{in_name};
    // streaming the underlying buffer transfers the whole file in one go
    contents << input.rdbuf();
    return contents.str();
}
147 
154 inline uint64_t num_lines(const std::string& filename, char delimiter = '\n')
155 {
156  io::mmap_file file{filename};
157  uint64_t num = 0;
158 
159  printing::progress progress{" > Counting lines in file: ", file.size(), 500,
160  32 * 1024 * 1024};
161  for (uint64_t idx = 0; idx < file.size(); ++idx)
162  {
163  progress(idx);
164  if (file[idx] == delimiter)
165  ++num;
166  }
167 
168  return num;
169 }
170 }
171 }
172 
173 #endif
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
std::string filename(const std::string &path)
Definition: unit_test.h:114