#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <dirent.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <string.h>
void* copyFile(void* arg) {
char *filename = ((char*)arg);
char *destname = strcat(filename, "copy");
int in = 0;
in = open(filename, O_RDONLY);
int out = 0;
out = open(destname, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
struct stat statbuf;
fstat(in, &statbuf);
off_t insize = statbuf.st_size;
lseek(out, insize - 1, SEEK_SET);
ssize_t wrote = write(out, "", 1);
char* inmap = mmap(0, insize, PROT_READ, MAP_FILE | MAP_PRIVATE, in, 0);
char* outmap = mmap(0, insize, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, out, 0);
memcpy(outmap, inmap, insize);
close(out);
}
int main(int argc, char *argv[]) {
DIR* d;
struct dirent* e;
d = opendir(argv[1]);
int num_entries = 0;
int i = 0;
pthread_t threads[50];
while((e = readdir(d)) != NULL) {
char *filename = e->d_name;
printf("%s\n", filename);
pthread_create(&threads[i], NULL, copyFile, filename);
i ;
}
for(int j = 0; j < i; j ) {
pthread_join(threads[j], NULL);
}
}
Hello, I am attempting to use mmap and memcpy to copy an entire directory of files using multithreading. I've got the code to copy the files in the directory, however it is forcing all COPIED files to be 1 Byte in size. I'm curious as to why this is happening and what can be done to resolve this bug.
I am very much a novice at C so please forgive my obvious semantic errors. I appreciate the assistance!
CodePudding user response:
There are a number of errors.
In
main,filenamewill point to the same address for all files. We must provide a unique address/copy. One way is to usestrdup. And, we should add afreecall at the bottom ofcopyFileto release the storage allocated by thestrdup.We must skip over
.and..when copying. Otherwise, the code segfaults.Using the same directory for input files and the copy can cause the equivalent of an infinite loop. That is, given a file
xyzthat is copied toxyzcopy, thereaddirmay be able to seexyzcopyas input and try to createxyzcopycopy. Better to use a separate output directory. That would require a major rewrite. But, a simpler fix is to skip any file that containscopy.The program will [try to] copy directory entries even if they are not ordinary files (e.g. it would try to do
openon a directory). We should check the file type and skip non-file types.In
copyFile, even with thestrdupinmain, the stringfilenamedoes not have enough space to allow for thestrcatof"copy". We need a separate buffer for the output filename.The
lseek/writeadds a superfluous binary zero at the end of the output file. This is to extend the output file. Better to useftruncatefor that.That's because intermixing
writewithmmapcan be problematic and is usually not the best practice. In your use case, it may be okay (because thewriteis done before themmap), but if we go to the trouble to usemmap, it's better to just use themmapbuffer pointer andmemcpy.copyFileis missing areturn. And,mainis also missing areturnThere is no
closeforinincopyFile.There is no error checking on
open,mmap,opendircalls.There should be
munmapcalls before theclosecalls.
Bugs not fixed:
The program is limited in how many entries the directory can have.
With
pthread_t threads[50];we can only have 50 entries in the directory. There is no check for overflow of this array.It makes no practical sense to do all files in parallel. This does not scale.
If we had (e.g.) 1,000,000 entries in the directory, we could run out of resources. We could run out of unused process slots and
pthread_createwould start to fail after a system wide limit is reached. We could run out of file descriptors and theopencalls would begin to fail after approximately 1024 open files.After a set number of threads is created (e.g. in practice, 4-5), the copying process would actually be slower because the system would spend most of its time switching between the threads rather than doing useful work.
For small files, the overhead of thread creation/teardown becomes significant relative to the running time of the thread.
It would be better to have a limited "pool" of "worker" threads, and intersperse the pthread_create calls with pthread_join calls, reclaiming/reusing the now unused/free "slot" when a thread completes.
A more efficient way would be to create the pool of threads (via pthread_create calls once at the start. Each thread would have a mailbox/queue of work to do. main could enqueue the entries to the threads, adding a new entry when it sees a thread become idle. The pthread_join calls could be done once at the end
Below is the corrected code.
I used preprocessor conditionals to denote old vs. new code (e.g.):
#if 0
// old code
#else
// new code
#endif
#if 1
// new code
#endif
Anyway, here is the corrected code. I've annotated the bugs/fixes with comments:
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <dirent.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <string.h>
void *
copyFile(void *arg)
{
// NOTE/BUG: no cast is needed from a void *
#if 0
char *filename = ((char *) arg);
#else
char *filename = arg;
#endif
// NOTE/BUG: there is _not_ enough space in filename to add "copy" -- this is
// UB (undefined behavior)
#if 0
char *destname = strcat(filename, "copy");
#else
char destname[1024];
strcpy(destname,filename);
strcat(destname,"copy");
#endif
int in = 0;
in = open(filename, O_RDONLY);
// NOTE/FIX: check for error
#if 1
if (in < 0) {
perror(filename);
exit(1);
}
#endif
int out = 0;
out = open(destname, O_RDWR | O_CREAT | O_TRUNC,
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
// NOTE/FIX: check for error
#if 1
if (out < 0) {
perror(destname);
exit(1);
}
#endif
struct stat statbuf;
fstat(in, &statbuf);
off_t insize = statbuf.st_size;
// NOTE/BUG: this will add a binary zero at the end of the output file
#if 0
lseek(out, insize - 1, SEEK_SET);
ssize_t wrote = write(out, "", 1);
#else
ftruncate(out,insize);
#endif
char *inmap = mmap(0, insize, PROT_READ, MAP_FILE | MAP_PRIVATE, in, 0);
// NOTE/FIX: check for error
#if 1
if (inmap == MAP_FAILED) {
perror("mmap/in");
exit(1);
}
#endif
char *outmap = mmap(0, insize, PROT_READ | PROT_WRITE,
MAP_FILE | MAP_SHARED, out, 0);
// NOTE/FIX: check for error
#if 1
if (inmap == MAP_FAILED) {
perror("mmap/in");
exit(1);
}
#endif
memcpy(outmap, inmap, insize);
// NOTE/FIX: we should unamp the output area
#if 1
munmap(outmap,insize);
#endif
close(out);
// NOTE/FIX: we should unmap and close the input descriptor
#if 1
munmap(inmap,insize);
close(in);
#endif
// NOTE/FIX: free the string allocated by strdup in main
#if 1
free(filename);
#endif
// NOTE/FIX: needs return value -- would be flagged by compiler if compiled
// with -Wall
#if 1
return (void *) 0;
#endif
}
int
main(int argc, char *argv[])
{
DIR *d;
struct dirent *e;
// NOTE/BUG: no checks for missing argument or bad directory
#if 0
d = opendir(argv[1]);
#else
if (argv[1] == NULL) {
fprintf(stderr,"missing argument\n");
exit(1);
}
d = opendir(argv[1]);
if (d == NULL) {
perror(argv[1]);
exit(1);
}
#endif
// NOTE/BUG: unused variable
#if 0
int num_entries = 0;
#endif
int i = 0;
pthread_t threads[50];
while ((e = readdir(d)) != NULL) {
// NOTE/FIX: must skip "." and ".."
#if 1
if (strcmp(e->d_name,".") == 0)
continue;
if (strcmp(e->d_name,"..") == 0)
continue;
#endif
// NOTE/FIX: only copy simple files (i.e. do _not_ copy subdirectories, etc.)
#if 1
if (e->d_type != DT_REG)
continue;
#endif
// NOTE/FIX: skip "copy" files
#if 1
if (strstr(e->d_name,"copy") != NULL)
continue;
#endif
// NOTE/BUG: this will present the _same_ memory address to all threads so there
// is a "race condition"
#if 0
char *filename = e->d_name;
#else
char *filename = strdup(e->d_name);
#endif
printf("%s\n", filename);
pthread_create(&threads[i], NULL, copyFile, filename);
i ;
}
for (int j = 0; j < i; j ) {
pthread_join(threads[j], NULL);
}
// NOTE/FIX: needs return
#if 1
return 0;
#endif
}
