博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Python3.5爬取cbooo.cn数据并且同步到mysql中
阅读量:7110 次
发布时间:2019-06-28

本文共 5321 字,大约阅读时间需要 17 分钟。

 

#!/usr/local/bin/python# -*- coding: utf-8 -*-# Python:                  3.5# Author:                  wucl(),zhenghai.zhang# Program:                 爬取CBO网站上所有电影的名称并写入数据库。# Version:                 0.1# History:                 2017.10.25import requests,time, pymysql, re, datetimefrom exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBodyhost = 'xxx'user = 'xxx'passwd = 'xxx'dbme = 'crawl'dbtarget = 'back_brace'table = 'movie_hotwords'tabledelta = 'movie_hotwords_delta'tablesync = 'slot_value'port = 3306tolist = ['xxx@xxx.com']def get_info():    try:        url = 'http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=1'        pData = requests.get(url).json()        return pData['tPage'], pData['tCount']    except:        print("获取总页数和总电影数失败")def get_movies(page):    try:        url = 'http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=' + str(page)        pData = requests.get(url).json()        movies_list = pData['pData']        return movies_list    except:        print('获取第%s页电影列表失败' % page)def Movie_insert(host, user, passwd, dbme, port, table, movies_list):    conn=pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")    cur=conn.cursor()    new_movies = []    punc = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.()::。·"    punctuation = punc    for movie in movies_list:        try:            movie['MovieName'] = re.sub(r"[%s]+" % punctuation, "", movie["MovieName"])            cmd = 'insert into %s(movie_id, movie_name) values("%s", "%s")' % (table, movie['ID'], movie['MovieName'])            cur.execute(cmd)            new_movies.append(movie)        except pymysql.Error:            print(" "*20, movie['MovieName'], "already exists, skip……")    cur.close()    conn.commit()    conn.close()    return new_moviesdef Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, movies_list, tablesync):    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")    cur = conn.cursor()    cur.execute("delete from %s " % dbme+"."+tabledelta)    for movie in movies_list:        try:            cmd = 'insert into %s(movie_id, movie_name) values("%s", "%s")' % (tabledelta, movie['ID'], movie['MovieName'])            cmdsync = 'insert into %s(slot_type_id, slot_value, create_by, modify_by, gmt_create, gmt_modify, out_value) values("%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (dbtarget+"."+tablesync, "xxxxxx", movie['MovieName'], "system", "system", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),"")            cur.execute(cmd)            cur.execute(cmdsync)        except pymysql.Error:            print(" " * 20, movie['MovieName'], "already exists, skip……")    try:        cmdbacktoskill = 'insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("BACKBRACE","testpass","SLOT","xxxxxx","init","SLOT_BACKBRACE_TESTPASS" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")' % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))        cmdskilltoskillpro = 'insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("SKILL","deploy","SLOT","xxxxxx","init","SLOT_SKILL_DEPLOY" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")' % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))        print(cmdbacktoskill)        cur.execute(cmdbacktoskill)        print(cmdskilltoskillpro)        cur.execute(cmdskilltoskillpro)    except pymysql.Error:        print("write into back_brace.release_task error!!!")    cur.close()    conn.commit()    conn.close()def Email(to, subject, body):    creds = Credentials(        username='xxxxxx',        password='xxxxxx')    account = Account(        primary_smtp_address='xxx@xxx.com',        credentials=creds,        autodiscover=True,        access_type=DELEGATE)    m = Message(        account=account,        subject=subject,        body=HTMLBody(body),        to_recipients=[Mailbox(email_address=to)])    m.send_and_save()if __name__ == '__main__':    update_movies = []    pages, counts = get_info()    pages = 1    for i in range(1,pages + 1):        print("*"*30,i,"*"*30)        movies_list = get_movies(i)        new_movies = Movie_insert(host, user, passwd, dbme, port, table, movies_list)        for new_movie in new_movies:            print(new_movie['MovieName'],"Added")            onemovie = {}            onemovie["ID"] = new_movie["ID"]            onemovie["MovieName"] = new_movie["MovieName"]            update_movies.append(onemovie)        time.sleep(1)    print(update_movies)    try:        Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_movies, tablesync)  # 将增加的电影写入movie_hotwords_delta表中    except:        print("Movie update and sync Error!")    subject = '本次新增电影名称'    body = "本次新增的电影名称为:

" for movie in update_movies: body += movie["MovieName"] + "
" for to in tolist: Email(to, subject, body)

 

欢迎大侠指点

 

转载地址:http://kmlhl.baihongyu.com/

你可能感兴趣的文章
centos通过screen命令恢复xshell
查看>>
Linux操作系统安装步骤
查看>>
日常日本語3
查看>>
Nginx的nginx.conf配置文件中文注释说明
查看>>
诡异的CBO参数_optimizer_use_histograms
查看>>
VMware ESXi 5.5 upgrade to 6.0 with VUM
查看>>
docker数据卷和主机共享文件
查看>>
android 40 Io编程
查看>>
MySQL服务器异常关闭,重启后,无法启动----因为磁盘空间满
查看>>
WPF疑难杂症之二(全屏幕窗口)
查看>>
实用ExtJS教程100例-007:ExtJS中Window组件最小化
查看>>
【转】灵活运用 SQL SERVER FOR XML PATH
查看>>
Windows 10 IoT Serials 4 - 如何在树莓派上使用Cortana语音助手
查看>>
Java集合---ConcurrentHashMap原理分析
查看>>
php字符串截取函数,支持中文截取
查看>>
PHP中var_dump
查看>>
看出在玩啥了吗?想不想体验下
查看>>
dedecms主从表 附加表 同时调用
查看>>
PostgreSQL的 initdb 源代码分析之十一
查看>>
《Python入门》Windows 7下Python Web开发环境搭建笔记
查看>>