Python通过docx模块提取简历里的邮箱地址并统一存放

本帖最后由 icestick8586 于 2018-9-17 21:01 编辑

import os,shutil,docx,re,time
from win32com import client as wc
#从所有级联目录读取文件到指定目录内
def count_files(file_dir: str) -> int:
    """Back up every ``.doc`` file found under *file_dir* and count them.

    The whole tree below *file_dir* is walked.  Each file whose extension is
    ``.doc`` (compared case-insensitively) is copied, flat, into a sibling
    directory named ``<file_dir>back``.  That directory is created on demand,
    so it does not appear when nothing matches.  Files that share a base name
    in different sub-directories overwrite one another in the backup.

    Args:
        file_dir: Root directory to scan.

    Returns:
        The number of ``.doc`` files that were copied.
    """
    # Normalise first: with a trailing separator, ``file_dir + "back"`` would
    # land *inside* the tree being walked instead of next to it.
    dst_dir = os.path.normpath(file_dir) + "back"
    count = 0
    for parent, _dirs, names in os.walk(file_dir):
        for name in names:
            # splitext ignores extension-less files such as one literally
            # named "doc", which a split('.') based check would accept.
            if os.path.splitext(name)[1].lower() != ".doc":
                continue
            count += 1
            src_path = os.path.join(parent, name)
            print(src_path)
            os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(src_path, dst_dir)
    return count
#提取每个docx简历文档里面的邮箱地址,我们这里使用python-docx模块来解决pip install python-docx
def count_mail(file_dir: str, dst_file: str) -> None:
    """Collect e-mail addresses from all ``.docx`` files under *file_dir*.

    Each ``.docx`` file in the tree is opened with python-docx and every
    paragraph is searched for e-mail-like text.  Matches are written to
    *dst_file* as one ``;``-terminated sequence, replacing any previous
    content of that file.

    Only paragraph text is inspected, because only ``doc.paragraphs`` is
    read here.

    Args:
        file_dir: Root directory containing the ``.docx`` files.
        dst_file: Path of the text file that receives the addresses.
    """
    # Compiled once; the pattern deliberately tolerates whitespace inside the
    # domain part, so matches are cleaned before being stored.
    pattern = re.compile(r'''([a-zA-Z0-9._%+-]+@[a-zA-Z0-9\t\s.-]+(\.[a-zA-Z0-9\t\s]{2,4}))''', re.VERBOSE)
    mail_list = []
    for parent, _dirs, names in os.walk(file_dir):
        for name in names:
            # python-docx only reads .docx packages; "~$" files are the lock
            # files Word leaves next to an open document.
            if name.startswith("~$") or not name.lower().endswith(".docx"):
                continue
            doc = docx.Document(os.path.join(parent, name))
            for para in doc.paragraphs:
                # Drop every whitespace character the pattern may have
                # swallowed (tabs and newlines too, not only spaces).
                mail_list.extend(
                    "".join(groups[0].split()) + ";"
                    for groups in pattern.findall(para.text)
                )
    # Explicit encoding keeps the output independent of the platform default.
    with open(dst_file, 'w', encoding="utf-8") as out:
        out.writelines(mail_list)
    print("=====================邮件信息写入成功===================")
#由于python-docx模块只能处理docx后缀，我们需要处理doc后缀的文件，必须通过win32com模块来把doc后缀转换成docx
def docxTodoc(old_doc: str, new_doc: str) -> None:
    """Convert the Word documents under *old_doc* to ``.docx`` in *new_doc*.

    Despite its name, the function converts ``.doc`` to ``.docx``.  It drives
    Microsoft Word through COM, so it needs Windows with Word installed.
    Every ``.doc``/``.docx`` file in the tree is opened and saved, flat, into
    *new_doc* under the same base name with a ``.docx`` extension.

    Args:
        old_doc: Root directory holding the source documents.
        new_doc: Existing directory that receives the converted files.
    """
    word = wc.Dispatch('Word.Application')
    try:
        for parent, _dirs, names in os.walk(old_doc):
            for name in names:
                stem, ext = os.path.splitext(name)
                # Skip Word's "~$" lock files and anything that is not a
                # Word document.
                if name.startswith("~$") or ext.lower() not in (".doc", ".docx"):
                    continue
                # Word is a separate process; absolute paths avoid relying on
                # whatever working directory it resolves relative paths from.
                src_path = os.path.abspath(os.path.join(parent, name))
                doc = word.Documents.Open(src_path)  # source document
                try:
                    # splitext keeps dots inside the base name, so
                    # "john.smith.doc" becomes "john.smith.docx".
                    new_filepath = os.path.abspath(os.path.join(new_doc, stem + ".docx"))
                    print(new_filepath)
                    # 12 is the file-format code passed to Word for .docx output.
                    doc.SaveAs(new_filepath, 12, False, "", True, "", False, False, False, False)  # converted file
                finally:
                    doc.Close()
                print(time.time())
    finally:
        # Always shut Word down, otherwise a failed conversion leaves a
        # background Word process behind.
        word.Quit()
if __name__ == '__main__':
    # Pipeline: back up the .doc files, convert them to .docx, then extract
    # the e-mail addresses from the converted files.
    old_doc = r"C:\Users\icestick\Desktop\51job_导出简历_20180917"  # source directory with the .doc resumes
    new_doc = r"C:\Users\icestick\Desktop\new_doc"  # target directory for the converted .docx files
    mail_extract = r"C:\Users\icestick\Desktop\test.txt"  # file receiving the extracted addresses

    print(count_files(old_doc))
    # The extraction used to run here as well, before any conversion had
    # happened; its output was overwritten by the final call below.
    if not os.path.exists(new_doc):
        os.mkdir(new_doc)
        print("=====================目录创建成功======================")
    # Conversion and extraction are the same whether or not the directory
    # already existed, so they are no longer duplicated in two branches.
    docxTodoc(old_doc, new_doc)
    print("=====================docx格式转换成功===================")
    count_mail(new_doc, mail_extract)

复制代码

icestick8586 · icestick8586

#通过docx的方式读取不到51job下载简历的情况下，可以使用如下方法：
# from django.test import TestCase
#coding:utf-8
import os,shutil,docx,re,time
from win32com import client as wc
#从所有级联目录读取文件到指定目录内
def count_files(file_dir: str) -> int:
    """Back up every ``.doc`` file found under *file_dir* and count them.

    The whole tree below *file_dir* is walked.  Each file whose extension is
    ``.doc`` (compared case-insensitively) is copied, flat, into a sibling
    directory named ``<file_dir>back``.  That directory is created on demand,
    so it does not appear when nothing matches.  Files that share a base name
    in different sub-directories overwrite one another in the backup.

    Args:
        file_dir: Root directory to scan.

    Returns:
        The number of ``.doc`` files that were copied.
    """
    # Normalise first: with a trailing separator, ``file_dir + "back"`` would
    # land *inside* the tree being walked instead of next to it.
    dst_dir = os.path.normpath(file_dir) + "back"
    count = 0
    for parent, _dirs, names in os.walk(file_dir):
        for name in names:
            # splitext ignores extension-less files such as one literally
            # named "doc", which a split('.') based check would accept.
            if os.path.splitext(name)[1].lower() != ".doc":
                continue
            count += 1
            src_path = os.path.join(parent, name)
            print(src_path)
            os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(src_path, dst_dir)
    return count
#提取每个docx简历文档里面的邮箱地址,我们这里使用python-docx模块来解决pip install python-docx
def count_mail(file_dir: str, dst_file: str) -> None:
    """Collect e-mail addresses from all ``.docx`` files under *file_dir*.

    Each ``.docx`` file in the tree is opened with python-docx and every
    paragraph is searched for e-mail-like text.  Matches are written to
    *dst_file* as one ``;``-terminated sequence, replacing any previous
    content of that file.

    Only paragraph text is inspected, because only ``doc.paragraphs`` is
    read here.

    Args:
        file_dir: Root directory containing the ``.docx`` files.
        dst_file: Path of the text file that receives the addresses.
    """
    # Compiled once; the pattern deliberately tolerates whitespace inside the
    # domain part, so matches are cleaned before being stored.
    pattern = re.compile(r'''([a-zA-Z0-9._%+-]+@[a-zA-Z0-9\t\s.-]+(\.[a-zA-Z0-9\t\s]{2,4}))''', re.VERBOSE)
    mail_list = []
    for parent, _dirs, names in os.walk(file_dir):
        for name in names:
            # python-docx only reads .docx packages; "~$" files are the lock
            # files Word leaves next to an open document.
            if name.startswith("~$") or not name.lower().endswith(".docx"):
                continue
            doc = docx.Document(os.path.join(parent, name))
            for para in doc.paragraphs:
                # Drop every whitespace character the pattern may have
                # swallowed (tabs and newlines too, not only spaces).
                mail_list.extend(
                    "".join(groups[0].split()) + ";"
                    for groups in pattern.findall(para.text)
                )
    # Explicit encoding keeps the output independent of the platform default.
    with open(dst_file, 'w', encoding="utf-8") as out:
        out.writelines(mail_list)
    print("=====================邮件信息写入成功===================")
#由于python-docx模块只能处理docx后缀，我们需要处理doc后缀的文件，必须通过win32com模块来把doc后缀转换成docx
def docxTodoc(old_doc: str, new_doc: str) -> None:
    """Convert the Word documents under *old_doc* to ``.docx`` in *new_doc*.

    Despite its name, the function converts ``.doc`` to ``.docx``.  It drives
    Microsoft Word through COM, so it needs Windows with Word installed.
    Every ``.doc``/``.docx`` file in the tree is opened and saved, flat, into
    *new_doc* under the same base name with a ``.docx`` extension.

    Args:
        old_doc: Root directory holding the source documents.
        new_doc: Existing directory that receives the converted files.
    """
    word = wc.Dispatch('Word.Application')
    try:
        for parent, _dirs, names in os.walk(old_doc):
            for name in names:
                stem, ext = os.path.splitext(name)
                # Skip Word's "~$" lock files and anything that is not a
                # Word document.
                if name.startswith("~$") or ext.lower() not in (".doc", ".docx"):
                    continue
                # Word is a separate process; absolute paths avoid relying on
                # whatever working directory it resolves relative paths from.
                src_path = os.path.abspath(os.path.join(parent, name))
                doc = word.Documents.Open(src_path)  # source document
                try:
                    # splitext keeps dots inside the base name, so
                    # "john.smith.doc" becomes "john.smith.docx".
                    new_filepath = os.path.abspath(os.path.join(new_doc, stem + ".docx"))
                    print(new_filepath)
                    # 12 is the file-format code passed to Word for .docx output.
                    doc.SaveAs(new_filepath, 12, False, "", True, "", False, False, False, False)  # converted file
                finally:
                    doc.Close()
                print(time.time())
    finally:
        # Always shut Word down, otherwise a failed conversion leaves a
        # background Word process behind.
        word.Quit()
def count_mail_1(file_dir: str, dst_file: str) -> None:
    """Collect e-mail addresses from ``.docx`` files by reading their XML.

    Alternative to the python-docx based extraction.  Each ``.docx`` under
    *file_dir* is opened as a zip archive, ``word/document.xml`` is decoded
    as UTF-8, all XML tags are removed and the remaining text is searched
    for e-mail-like strings.  Matches are written to *dst_file* (UTF-8) as
    one ``;``-terminated sequence, replacing any previous content.

    Args:
        file_dir: Root directory containing the ``.docx`` files.
        dst_file: Path of the text file that receives the addresses.

    Raises:
        zipfile.BadZipFile: If a ``.docx`` file is not a valid zip archive.
        KeyError: If an archive has no ``word/document.xml`` member.
    """
    import zipfile
    import re

    # Compiled once; the pattern deliberately tolerates whitespace inside the
    # domain part, so matches are cleaned before being stored.
    pattern = re.compile(r'''([a-zA-Z0-9._%+-]+@[a-zA-Z0-9\t\s.-]+(\.[a-zA-Z0-9\t\s]{2,4}))''', re.VERBOSE)
    mail_list = []
    for parent, _dirs, names in os.walk(file_dir):
        for name in names:
            # Only .docx packages carry word/document.xml; "~$" files are the
            # lock files Word leaves next to an open document.
            if name.startswith("~$") or not name.lower().endswith(".docx"):
                continue
            # The context manager closes the archive even if reading fails.
            with zipfile.ZipFile(os.path.join(parent, name), "r") as archive:
                text = archive.read("word/document.xml").decode("utf-8")
            text = re.sub(r"<.*?>", "", text)  # strip every XML tag
            # Drop every whitespace character the pattern may have swallowed
            # (tabs and newlines too, not only spaces).
            mail_list.extend(
                "".join(groups[0].split()) + ";"
                for groups in pattern.findall(text)
            )
    with open(dst_file, 'w', encoding="utf-8") as out:
        out.writelines(mail_list)
    print("=====================邮件信息写入成功===================")
if __name__ == '__main__':
    # Script entry point: convert the exported resumes to .docx, then pull
    # the e-mail addresses out of the converted files via count_mail_1.
    # The count_files backup and the python-docx based count_mail calls are
    # intentionally not run in this variant.
    old_doc = r"C:\Users\icestick\Desktop\51job_导出简历_20180917"  # source directory with the .doc resumes
    new_doc = r"C:\Users\icestick\Desktop\new_doc"  # target directory for the converted .docx files
    mail_extract = r"C:\Users\icestick\Desktop\test.txt"  # file receiving the extracted addresses

    target_missing = not os.path.exists(new_doc)
    if target_missing:
        os.mkdir(new_doc)
        print("=====================目录创建成功======================")

    # Same steps whether or not the target directory had to be created.
    docxTodoc(old_doc, new_doc)
    print("=====================docx格式转换成功===================")
    count_mail_1(new_doc, mail_extract)