diff -u b/Lib/test/test_file2k.py b/Lib/test/test_file2k.py --- b/Lib/test/test_file2k.py +++ b/Lib/test/test_file2k.py @@ -652,6 +652,33 @@ self.f.writelines('') self._test_close_open_io(io_func) + def test_iteration_torture(self): + # bpo-31530 + with open(self.filename, "wb") as fp: + for i in xrange(2**20): + fp.write(b"0"*50 + b"\n") + with open(self.filename, "rb") as f: + def it(): + for l in f: + pass + self._run_workers(it, 10) + + def test_iteration_seek(self): + # bpo-31530: Crash when concurrently seek and iterate over a file. + with open(self.filename, "wb") as fp: + for i in xrange(10000): + fp.write(b"0"*50 + b"\n") + with open(self.filename, "rb") as f: + it = iter([1] + [0]*10) # one thread reads, others seek + def iterate(): + if next(it): + for l in f: + pass + else: + for i in xrange(100): + f.seek(i*100, 0) + self._run_workers(iterate, 10) + @unittest.skipUnless(os.name == 'posix', 'test requires a posix system.') class TestFileSignalEINTR(unittest.TestCase): unchanged: --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2017-09-20-18-28-09.bpo-31530.CdLOM7.rst @@ -0,0 +1,4 @@ +Fixed crashes when iterating over a file on multiple threads. +seek() and next() methods of file objects now raise an exception during +concurrent operation on the same file object. +A lock can be used to prevent the error. diff -u b/Objects/fileobject.c b/Objects/fileobject.c --- b/Objects/fileobject.c +++ b/Objects/fileobject.c @@ -430,7 +430,7 @@ if (Py_REFCNT(f) > 0) { PyErr_SetString(PyExc_IOError, "close() called during concurrent " - "operation on the same file object."); + "operation on the same file object"); } else { /* This should not happen unless someone is * carelessly playing with the PyFileObject @@ -438,7 +438,7 @@ * pointer. */ PyErr_SetString(PyExc_SystemError, "PyFileObject locking error in " - "destructor (refcnt <= 0 at close)."); + "destructor (refcnt <= 0 at close)"); } return NULL; } @@ -609,7 +609,12 @@ return NULL; } -static void drop_readahead(PyFileObject *); +static void +drop_file_readahead(PyFileObject *f) +{ + PyMem_FREE(f->f_buf); + f->f_buf = NULL; +} /* Methods */ @@ -632,7 +637,7 @@ Py_XDECREF(f->f_mode); Py_XDECREF(f->f_encoding); Py_XDECREF(f->f_errors); - drop_readahead(f); + drop_file_readahead(f); Py_TYPE(f)->tp_free((PyObject *)f); } @@ -767,7 +772,7 @@ if (f->f_fp == NULL) return err_closed(); - drop_readahead(f); + drop_file_readahead(f); whence = 0; if (!PyArg_ParseTuple(args, "O|i:seek", &offobj, &whence)) return NULL; @@ -2236,12 +2241,16 @@ {0}, }; +typedef struct { + char *buf, *bufptr, *bufend; +} readaheadbuffer; + static void -drop_readahead(PyFileObject *f) +drop_readaheadbuffer(readaheadbuffer *rab) { - if (f->f_buf != NULL) { - PyMem_Free(f->f_buf); - f->f_buf = NULL; + if (rab->buf != NULL) { + PyMem_FREE(rab->buf); + rab->buf = NULL; } } @@ -2249,35 +2258,34 @@ (unless at EOF) and no more than bufsize. Returns negative value on error, will set MemoryError if bufsize bytes cannot be allocated. */ static int -readahead(PyFileObject *f, Py_ssize_t bufsize) +readahead(PyFileObject *f, readaheadbuffer *rab, Py_ssize_t bufsize) { Py_ssize_t chunksize; - if (f->f_buf != NULL) { - if( (f->f_bufend - f->f_bufptr) >= 1) + if (rab->buf != NULL) { + if ((rab->bufend - rab->bufptr) >= 1) return 0; else - drop_readahead(f); + drop_readaheadbuffer(rab); } - if ((f->f_buf = (char *)PyMem_Malloc(bufsize)) == NULL) { + if ((rab->buf = PyMem_MALLOC(bufsize)) == NULL) { PyErr_NoMemory(); return -1; } FILE_BEGIN_ALLOW_THREADS(f) errno = 0; - chunksize = Py_UniversalNewlineFread( - f->f_buf, bufsize, f->f_fp, (PyObject *)f); + chunksize = Py_UniversalNewlineFread(rab->buf, bufsize, f->f_fp, (PyObject *)f); FILE_END_ALLOW_THREADS(f) if (chunksize == 0) { if (ferror(f->f_fp)) { PyErr_SetFromErrno(PyExc_IOError); clearerr(f->f_fp); - drop_readahead(f); + drop_readaheadbuffer(rab); return -1; } } - f->f_bufptr = f->f_buf; - f->f_bufend = f->f_buf + chunksize; + rab->bufptr = rab->buf; + rab->bufend = rab->buf + chunksize; return 0; } @@ -2287,45 +2295,43 @@ logarithmic buffer growth to about 50 even when reading a 1gb line. */ static PyStringObject * -readahead_get_line_skip(PyFileObject *f, Py_ssize_t skip, Py_ssize_t bufsize) +readahead_get_line_skip(PyFileObject *f, readaheadbuffer *rab, Py_ssize_t skip, Py_ssize_t bufsize) { PyStringObject* s; char *bufptr; char *buf; Py_ssize_t len; - if (f->f_buf == NULL) - if (readahead(f, bufsize) < 0) + if (rab->buf == NULL) + if (readahead(f, rab, bufsize) < 0) return NULL; - len = f->f_bufend - f->f_bufptr; + len = rab->bufend - rab->bufptr; if (len == 0) - return (PyStringObject *) - PyString_FromStringAndSize(NULL, skip); - bufptr = (char *)memchr(f->f_bufptr, '\n', len); + return (PyStringObject *)PyString_FromStringAndSize(NULL, skip); + bufptr = (char *)memchr(rab->bufptr, '\n', len); if (bufptr != NULL) { bufptr++; /* Count the '\n' */ - len = bufptr - f->f_bufptr; - s = (PyStringObject *) - PyString_FromStringAndSize(NULL, skip + len); + len = bufptr - rab->bufptr; + s = (PyStringObject *)PyString_FromStringAndSize(NULL, skip + len); if (s == NULL) return NULL; - memcpy(PyString_AS_STRING(s) + skip, f->f_bufptr, len); - f->f_bufptr = bufptr; - if (bufptr == f->f_bufend) - drop_readahead(f); + memcpy(PyString_AS_STRING(s) + skip, rab->bufptr, len); + rab->bufptr = bufptr; + if (bufptr == rab->bufend) + drop_readaheadbuffer(rab); } else { - bufptr = f->f_bufptr; - buf = f->f_buf; - f->f_buf = NULL; /* Force new readahead buffer */ + bufptr = rab->bufptr; + buf = rab->buf; + rab->buf = NULL; /* Force new readahead buffer */ assert(len <= PY_SSIZE_T_MAX - skip); - s = readahead_get_line_skip(f, skip + len, bufsize + (bufsize>>2)); + s = readahead_get_line_skip(f, rab, skip + len, bufsize + (bufsize>>2)); if (s == NULL) { - PyMem_Free(buf); + PyMem_FREE(buf); return NULL; } memcpy(PyString_AS_STRING(s) + skip, bufptr, len); - PyMem_Free(buf); + PyMem_FREE(buf); } return s; } @@ -2343,7 +2349,30 @@ if (!f->readable) return err_mode("reading"); - l = readahead_get_line_skip(f, 0, READAHEAD_BUFSIZE); + { + /* + Multiple threads can enter this method while the GIL is released + during file read and wreak havoc on the file object's readahead + buffer. To avoid dealing with cross-thread coordination issues, we + cache the file buffer state locally and only set it back on the file + object when we're done. + */ + readaheadbuffer rab = {f->f_buf, f->f_bufptr, f->f_bufend}; + f->f_buf = NULL; + l = readahead_get_line_skip(f, &rab, 0, READAHEAD_BUFSIZE); + /* + Make sure the file's internal read buffer is cleared out. This will + only do anything if some other thread interleaved with us during + readahead. We want to drop any changeling buffer, so we don't leak + memory. We may lose data, but that's what you get for reading the same + file object in multiple threads. + */ + drop_file_readahead(f); + f->f_buf = rab.buf; + f->f_bufptr = rab.bufptr; + f->f_bufend = rab.bufend; + } + if (l == NULL || PyString_GET_SIZE(l) == 0) { Py_XDECREF(l); return NULL; @@ -2692,7 +2721,7 @@ } else { PyErr_SetString(PyExc_TypeError, - "argument must be an int, or have a fileno() method."); + "argument must be an int, or have a fileno() method"); return -1; } only in patch2: unchanged: --- a/Misc/NEWS.d/next/Core and Builtins/2017-09-20-18-28-09.bpo-31530.CdLOM7.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2017-09-20-18-28-09.bpo-31530.CdLOM7.rst @@ -1,4 +1 @@ Fixed crashes when iterating over a file on multiple threads. -seek() and next() methods of file objects now raise an exception during -concurrent operation on the same file object. -A lock can be used to prevent the error.