import pymysql
import requests  # Python HTTP client library, used here for the forum API and detail pages
from bs4 import BeautifulSoup
import time
import datetime
import traceback
import random
requests.packages.urllib3.disable_warnings()  # silence InsecureRequestWarning from verify=False requests
requests.adapters.DEFAULT_RETRIES = 5  # increase the number of connection retries
# Connect to the database
conn = pymysql.connect(host='localhost', user='root', password='123456', database='test', charset='utf8mb4',port=3306)
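# Note: PyMySQL leaves autocommit disabled by default, which is why the helper
# functions below call conn.commit() explicitly after every statement.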
alurls = open('./import/aliyun.txt', "w", encoding="utf-8")  # only used by the commented-out debug writes below
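# Overall flow: index() walks the listing API one offset at a time and collects
# discussion ids, details() fetches each discussion page and extracts the title
# and post body, and mys() rewrites the markup and inserts the result into what
# appear to be Flarum forum tables (discussions, posts, discussion_tag,
# discussion_user) via create(). There is no explicit stop condition: once the
# API runs out of data, index() simply catches the resulting IndexError on every
# following offset and the main loop keeps going.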
def index(page):
    print('Crawling data... offset {}...'.format(page))
    # url = 'http://www.baidu.com'  (first page)
    # url = 'https://www.00tuz.com/api/discussions?include=user,lastPostedUser,tags,tags.parent,firstPost&sort=&page[offset]={}'.format(page)
    url = 'https://aliyunshare.cn/api/discussions?include=user%2ClastPostedUser%2CfirstPost%2Ctags&sort&page%5Boffset%5D={}'.format(page)
    # print('----------- crawling page ' + str(page + 1) + ' ------')
    # Fetch the API response for the url
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
    headers = {'User-Agent': user_agent}
    s = requests.session()
    s.keep_alive = False  # attempt to disable connection reuse
    htmljson = s.get(url, headers=headers, verify=False).json()
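    # The endpoint looks like a Flarum-style JSON:API listing; the parsing below
    # only relies on a response shaped roughly like
    #   {"data": [{"type": "discussions", "id": "123", ...}, ...]}
    # Since the caller steps page[offset] one at a time, only the first entry of
    # each response is taken.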
# print(htmljson)
    info = ['id']  # keys to read from each discussion object
    typeinfo = ['title']  # currently unused
    try:
        # Parse the JSON: collect the id of the first discussion in 'data'
        lists = []
        datasarry = ['data']
        for data in datasarry:
            # print(htmljson[data][0])
            for i in info:
                # print(htmljson[data][0][i], "primary key id")
                lists.append(htmljson[data][0][i])
        details(lists)
    except Exception as e:
        print(e)
        # Sanitize quotes in the error message before returning it
        data = str(e)
        data = data.replace('"', '*')
        data = data.replace("'", '*')
        return data
# Crawl each discussion detail page
def details(lists):
for i in lists:
        url = 'https://aliyunshare.cn/d/{}'.format(i)
        # Fetch the detail page source for the url
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
        headers = {'User-Agent': user_agent}
        html = requests.get(url, headers=headers)
        print(html.status_code)
        html = BeautifulSoup(html.content, 'html.parser')
        div = html.select('div.Post-body')  # post bodies
        title = html.select('h2')  # titles
        aurl = html.select("div.Post-body a[href^='https://www.aliyundrive.com/']")  # Aliyun Drive share links
        otherurl = html.select("a[href^='https://alywp.net/']")  # other share links (unused below)
        imgurl = html.select('img[src]')  # images (unused below)
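        # Assumption about the page layout: the discussion title is rendered in an
        # <h2> and each post body in a div with class Post-body; mys() pairs them
        # up positionally with zip(), so the two lists are assumed to line up.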
# print(title)
# print(aurl)
# print(otherurl)
div1=''.join(str(i) for i in div)
str1 = ''.join(str(i) for i in title)
str2 = ''.join(str(i) for i in aurl)
# print(str1)
# print(str2)
# print(div1)
# print(div[0])
        # alurls.write('No.' + str(page) + ' Title: ' + str1 + '\n' + 'Link: ' + str2 + '\n' + '--------------------' + '\n')
        # print('No.' + str(page) + '\n' + 'Title: ' + str1 + '\n' + 'Content: <r>' + div1.replace('img', 'IMG') + '</r>' + '\n' + '--------------------' + '\n')
        # print(div1.replace('img', 'IMG'))
        # alurls.write('No.' + str(page) + '\n' + 'Title: ' + str1 + '\n' + 'Content <r>' + div1 + '</r>' + '\n' + '--------------------' + '\n')
        # Check whether the last page has been reached (response comes back empty)
        # if len(str1) == 0 and str2.isspace():
        #     break
        mys(title, div, i)
        time.sleep(4)  # throttle requests to the detail pages
# Write one scraped discussion into the MySQL tables
def mys(title,div,sd):
try:
        # Pair each <h2> heading with its Post-body div positionally
        dic = dict(zip(title, div))
        # print(dic)
        for heading, body in dic.items():
            titles = ''.join(str(c) for c in heading)  # inner HTML of the heading
            contents = ''.join(str(c) for c in body)   # inner HTML of the post body
            divs = "<r>" + str(contents) + "</r>"
            # print(divs)
            title = titles.replace('h2', '')
            # Rewrite the markup into the tag names expected by the forum renderer
            divs = divs.replace('img', 'IMG')
            divs = divs.replace('<a ', '<URL ')
            divs = divs.replace('</a>', '</URL>')
            divs = divs.replace('href=', 'url=')
# print(divs)
            # sql = 'insert INTO txt(title,content) VALUES(%s,%s)'
            # Insert into the discussions table (title row)
            idinfo = Selectid(sd)
            if idinfo is not None:
                print("Already exists, skipping")
                continue
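            # The source discussion id is stored in the is_ids column and used by
            # Selectid() above for de-duplication. The INSERT statements below
            # target what look like Flarum tables (discussions, posts,
            # discussion_tag, discussion_user); user_id 1 and tag id 11 are
            # hard-coded for the target forum.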
            number = random.randint(99, 999)  # randomized view count
            created = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current timestamp
            sql = 'insert INTO `discussions`(title,created_at,user_id,slug,best_answer_notified,view_count,is_ids) VALUES(%s,%s,%s,%s,%s,%s,%s)'
            args = (str(title), str(created), 1, '', 0, number, sd)
global last_id
last_id = create(sql,args)
print(last_id,'discussionsid')
postsSQL ="INSERT INTO posts(discussion_id,number,created_at,user_id,type,content,edited_user_id,hidden_at,hidden_user_id,ip_address,is_private,is_approved) " \
"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
postsArgs=(last_id,1,str(created),1,'comment',str(divs),None,None,None,'127.0.0.1',0,1)
posts_id= create(postsSQL,postsArgs)
print(posts_id,"postid")
            # Attach the discussion to a tag
            tagsql = 'INSERT INTO discussion_tag VALUES(%s,%s)'
            tagargs = (last_id, 11)  # 11 is the tag id on the target forum
create(tagsql,tagargs)
            # discussion_user: record the poster
usersql='INSERT INTO discussion_user(user_id,discussion_id,last_read_at) VALUES (%s,%s,%s)'
userargs=(1,last_id,str(created))
create(usersql,userargs)
    except Exception as e:
        print(e)
        traceback.print_exc()
        # Delete the row just inserted into discussions for this id
        # deleteid('discussions', last_id)
# MySQL insert helper
def create(sql, args):
    try:
        cursor = conn.cursor()
        cursor.execute(sql, args)
        lastId = cursor.lastrowid  # auto-increment id of the inserted row
        return lastId
    except Exception as e:
        print('Database error:', e)
        conn.rollback()
        # Sanitize quotes in the error message before returning it
        data = str(e)
        data = data.replace('"', '*')
        data = data.replace("'", '*')
        return data
    finally:
        conn.commit()
        cursor.close()
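# Note: on failure create() returns the sanitized error string instead of an id,
# so a caller that stores the return value (e.g. last_id above) may end up holding
# an error message rather than a row id; a stricter variant would re-raise instead.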
# Delete a single row
def deleteid(db, args):
    """
    :param db: table name
    :param args: id to delete
    :return:
    """
    try:
        cursor = conn.cursor()
        sql = "DELETE FROM {db} where id={id}".format(db=db, id=args)
        cursor.execute(sql)
        print('Delete succeeded')
    except Exception as e:
        print('Delete failed', e)
    finally:
        conn.commit()
        cursor.close()
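# Note: deleteid() and Selectid() interpolate values directly into the SQL string,
# which is only reasonable for the numeric ids this script generates itself;
# parameterized queries (as used in create()) are the safer pattern.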
# Look up a discussion by its source id (is_ids)
def Selectid(args):
    try:
        cursor = conn.cursor()
        sql = 'SELECT is_ids FROM discussions where is_ids={is_ids}'.format(is_ids=args)
        cursor.execute(sql)
        result = cursor.fetchone()
        return result
    except Exception as e:
        print('Query error', e)
    finally:
        conn.commit()
        cursor.close()
# main
if __name__ == '__main__':
    # Start from offset 312 and walk forward one offset at a time
    for page in range(312, 9999):
        index(page)
    conn.close()
    alurls.close()